Load regions and spatial FIPS

#load health officer regions
healthofficer_shortname <- tribble(~region_acronym, ~region_long, ~ region_full,
  "ABAHO",      "Association of Bay Area Health Officers (ABAHO)",  "Association of Bay Area Health Officers (ABAHO)",    
  "GSRHO",      "Greater Sacramento Region Health Officers", "Greater Sacramento Region Health Officers (GSRHO)",
  "RANCHO",     "Rural Association of Northern California Health Officers (RANCHO)", "Rural Association of Northern California Health Officers (RANCHO)",
  "SJVC",       "San Joaquin Valley Consortium (SJVC)", "San Joaquin Valley Consortium (SJVC)",
  "SCAL",       "Southern California Health Officers", "Southern California Health Officers (SCAL)"
)

 regions <- read_csv("data/CAregions.csv") %>% 
    select( county, region_healthofficer , dof_pop_county) %>%
    rename( region_long=region_healthofficer) %>%
    left_join( healthofficer_shortname, by = "region_long") %>% na.omit()

#load data for spatial plotting
spdf <- geojsonsf::geojson_sf("data/counties.geojson")
county_fips<-data.frame(county= spdf$NAME_1, county_fips= spdf$COUNTYFI_1)

Standardized color palette and labels for plotting

MAE_map_col<-viridis_pal()(6)
# MAE_map_col<-rev(map_col[1:10])
names(MAE_map_col) <- c("columbia", "covid_nearterm", "m.proj", "lemma", "simple", "CA-baseline")
gglabels_MAE<-c("columbia"="Columbia", "covid_nearterm"= "COVID Near-Term", "m.proj"="Ensemble", "lemma"="Lemma",   "simple"="Simple Growth", "CA-baseline"="CA-baseline")

forecaster_map_col<-viridis_pal()(6)
# MAE_map_col<-rev(map_col[1:10])
names(forecaster_map_col) <- c("Columbia", "COVID Nearterm", "Ensemble", "LEMMA", "Simple Growth", "CA-baseline")

region_col<- met.brewer("Hokusai1", 5)
names(region_col)<-c("Association of Bay Area Health Officers (ABAHO)", "Greater Sacramento Region Health Officers (GSRHO)", "Rural Association of Northern California Health Officers (RANCHO)", "San Joaquin Valley Consortium (SJVC)", "Southern California Health Officers (SCAL)")   
                     
region_col_abr<- met.brewer("Hokusai1", 5)
names(region_col_abr)<-c("ABAHO", "GSRHO", "RANCHO", "SJVC", "SCAL") 

num_forecasts_col<-met.brewer("Hiroshige", 5)

Figure 1. Plot hospitalizations and variant prevalence to identify periods of interest

#define start and end dates for analysis period
alpha_start<- as.Date("2021-04-21")
alpha_end<- as.Date("2021-06-01")
delta_start<- as.Date("2021-07-21")
delta_end<- as.Date("2021-09-01")
omicron_start<- as.Date("2021-12-21")
omicron_end<- as.Date("2022-02-01")

variant_dates <- tribble(
  ~variant_period, ~start_date, ~end_date, 
  "alpha_frac", alpha_start, alpha_end,
  "delta_frac", delta_start, delta_end,
  "omicron_frac", omicron_start, omicron_end
)


#define color palette and labels for variants
variant_cols <- met.brewer("Greek", 5)
names(variant_cols) <- c("alpha_frac", "delta_frac", "gamma_frac", 
 "omicron_frac", "other_frac")
variant_labels <- c("alpha_frac"= "Alpha", "delta_frac"= "Delta" , "gamma_frac"= "Gamma", 
 "omicron_frac"="Omicron", "other_frac"="Other")

#actual hospitalizations
actuals <- fread("data/COVID_hospital_census.csv") %>%  mutate(index = mdy( Most.Recent.Date)) %>% rename(county=County.Name) %>% left_join(regions, by= "county") %>%
  group_by( index ) %>% select(county, region=region_acronym, index, hospitalized= COVID.19.Positive.Patients)

actuals_state<- actuals %>% group_by( index ) %>% summarize(hospitalized= sum(hospitalized, na.rm=TRUE))

A<-actuals_state %>% filter(index> as.Date("2021-02-01"), index<as.Date("2022-02-01")) %>% ggplot() + geom_line(aes(x=index, y=hospitalized)) + theme_classic()+ xlab("Date")+ ylab("Hospitalization Census")+ 
   geom_rect(aes(xmin = start_date, xmax = end_date, fill = variant_period), 
    ymin = -Inf, ymax = Inf, alpha = 0.3, 
    data = variant_dates)+ 
  scale_fill_manual(name="Analysis\nPeriod", labels=variant_labels[names(variant_cols) %in% unique(variant_dates$variant_period)], values=variant_cols[names(variant_cols) %in% unique(variant_dates$variant_period)] )
  



variants <-  fread("data/region_variants_summary.csv") %>% mutate(index=as.Date(index)) 

variants_state<- variants %>% filter(region== "California", index> as.Date("2021-02-01") & index < as.Date("2022-02-01")) %>% select(index, region, alpha_frac, gamma_frac, omicron_frac, delta_frac, other_frac) %>% pivot_longer(!c(region, index), names_to = "Variant", values_to = "fraction")

B<- variants_state %>% ggplot()+ geom_line(aes(x=index, y=fraction, col=Variant))+ xlab("Date")+ ylab("Variant Prevalence") + theme_classic()+
  scale_color_manual(values=variant_cols, labels = variant_labels)+
 #  scale_color_manual(values=met.brewer("Greek", 5), labels = c("alpha_frac"= "Alpha", "delta_frac"= "Delta" , "gamma_frac"= "Gamma", 
 # "omicron_frac"="Omicron", "other_frac"="Other"))+
geom_rect(data = variant_dates, aes(xmin = start_date, xmax = end_date, fill = variant_period), ymin = -Inf, ymax = Inf, alpha = 0.3)+  scale_fill_manual(name="Analysis\nPeriod", labels=variant_labels[names(variant_cols) %in% unique(variant_dates$variant_period)], values=variant_cols[names(variant_cols) %in% unique(variant_dates$variant_period)] )+
  guides(fill="none")

reff_state <- read_csv("data/state_reff.csv") %>%
  filter(model == "m.rt") %>% mutate( location = "CA", location_level= "state") 
C<- reff_state %>% filter(index> as.Date("2021-02-01"), index< as.Date("2022-02-01")) %>% ggplot()+ geom_line(aes(x=index, y=estimate))+ geom_hline(yintercept=1, lty="dashed")+ xlab("Date")+ ylab("R-effective Estimate") + theme_classic()+
  scale_color_manual(values=variant_cols, labels = variant_labels)+
geom_rect(data = variant_dates, aes(xmin = start_date, xmax = end_date, fill = variant_period), ymin = -Inf, ymax = Inf, alpha = 0.3)+  scale_fill_manual(name="Analysis\nPeriod", labels=variant_labels[names(variant_cols) %in% unique(variant_dates$variant_period)], values=variant_cols[names(variant_cols) %in% unique(variant_dates$variant_period)] )+
  guides(fill="none")

col_one<- cowplot::plot_grid(A, B, C, labels="AUTO", ncol=1, align="hv", axis="lr")

D<- regions %>% left_join(county_fips, by="county") %>%
  left_join(urbnmapr::counties, by = "county_fips") %>% 
  filter(state_name =="California") %>% 
  ggplot(mapping = aes(long, lat, group = group, fill = region_full)) +
  geom_polygon(color = "#ffffff", size = .25) +
scale_fill_manual(name = "Health Officer Regions", values= region_col)+
  coord_map(projection = "albers", lat0 = 39, lat1 = 45) +
  guides(fill = guide_legend(title.position="top", nrow = 6)) +
  theme_classic()+
  xlab(NULL)+ ylab(NULL)+
    theme(axis.line = element_blank(),
    plot.background = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    axis.text.x=element_blank(),
        axis.ticks.x=element_blank(),
        axis.text.y=element_blank(),
        axis.ticks.y=element_blank(),
    legend.position="bottom")

cowplot::plot_grid(col_one, D, labels=c("", "D"), ncol=2, rel_widths=c(2, 1))

Import and Clean Socio-Economic Data Sets

CA county data sets from: https://www.ers.usda.gov/data-products/county-level-data-sets/

#Socio-economic county-level data
#daily vax record
vx_odp_raw <-read.csv("https://data.chhs.ca.gov/dataset/e283ee5a-cf18-4f20-a92c-ee94a2866ccd/resource/130d7ba2-b6eb-438d-a412-741bde207e1c/download/covid19vaccinesbycounty.csv") %>% mutate(index = as.Date(administered_date)+ 14) #Add two week lag beyond administration date to get at "fully vaccinated" status

vx <- vx_odp_raw %>% arrange(index, county) %>% filter(index>=as.Date("2021-01-01"), county %notin% c("All CA and Non-CA Counties", "All CA Counties", "Unknown")) %>%  left_join(regions) %>% 
  mutate(perc_COVIDvx= cumulative_fully_vaccinated/dof_pop_county) %>% select(index, county, perc_COVIDvx)

poverty<- read_xlsx("data/PovertyReport.xlsx", skip=4) %>% select(county= Name, RUC_Code=`RUC Code`, perc_pov= `Percent...7`) %>% filter(county!= "California")  

unemploy<-read_xlsx("data/UnemploymentReport.xlsx", skip=2) %>% select(county= Name, perc_unemploy= `2020`, income=`Median Household Income (2019)`) %>% filter(county!= "California") %>% mutate(county = gsub(" County, CA", "", county), county = gsub(" County/city, CA", "", county))  

edu<-read_xlsx("data/EducationReport.xlsx", skip=2) %>% select(county= Name, education=`2015-2019`) %>% filter(county!= "California") %>% mutate(county = gsub(", CA", "", county))    

reff_curve_cnty <- fread("data/cnty_reff.csv") %>%
     filter(model=="m.rt") %>% select(-model, -archivedt) %>% mutate(index=as.Date(index)) 

reff_dates <- seq(as.Date(as.Date("2021-01-01")), max(reff_curve_cnty$index, na.rm = T), by = "1 day")

reff_change <- bind_rows(lapply(reff_dates, function(t){
  reff_diff <- reff_curve_cnty %>% filter(index %in% c(t, t - lubridate::days(7))) %>%
    group_by(county) %>%
    summarize(low = first(estimate), high = last(estimate)) %>%
    mutate(diff = high - low, index = t)
})) %>% select(index, county, diff)

7 days

Score model performance using fractional ranking

score_cards_tourney <-fread("results/score_cards_2021Jan1_2022Feb1_allMAE_baseline.csv") %>% filter(model %in% c("columbia", "lemma", "m.proj", "covid_nearterm", "simple", "CA-baseline"), geo_value %notin% c("Alpine", "Sierra", "Sutter")) %>% rename(county=geo_value)  %>% mutate(index=as.Date(forecast_date), target_end_date=as.Date(target_end_date)) %>% filter(forecast_date>="2021-02-01", target_end_date <=omicron_end)

#Update MAE definitions here as needed (MAE_7day, MAE_14day, MAE_21day, MAE_28day) and filter by appropriate ahead/delay (e.g., 7, 14, 21, 28)
delay<-7
MAE_ranks_long <- score_cards_tourney %>% filter(ahead <=delay) %>% group_by(county, index) %>% select(index, county, model, forecaster, MAE=MAE_7day, norm_MAE=norm_MAE_7day)  %>% distinct() %>% mutate(MAE_rank=dense_rank(desc(-MAE)), norm_MAE_rank=dense_rank(desc(-norm_MAE)))

max_models<-MAE_ranks_long %>% group_by(county, index)  %>% summarize(models_per_day=max(norm_MAE_rank, na.rm=TRUE))

fractional_ranks<- MAE_ranks_long %>% left_join(max_models, by=c("county", "index")) %>% mutate(fraction_rank= 1- (norm_MAE_rank-1)/models_per_day)
daily_winner<-fractional_ranks %>% group_by(county, index) %>% top_n(n=-1, norm_MAE_rank) %>% select(-norm_MAE_rank)
county_sums<-fractional_ranks %>% group_by(county, model) %>% summarize(county_sum= sum(fraction_rank, na.rm=TRUE))
county_sums_wide<-county_sums %>% pivot_wider(names_from="model", values_from = "county_sum")

alpha_ranks <- fractional_ranks %>% filter(index > as.Date(alpha_start) & index < as.Date(alpha_end))
alpha_daily_winner<-alpha_ranks %>% group_by(county, index) %>% top_n(n=-1, norm_MAE_rank) %>% select(-norm_MAE_rank)
alpha_county_by_model <-alpha_ranks %>% group_by(county, model) %>% summarize(sum_fraction=sum(fraction_rank, na.rm=TRUE))
alpha_winner<- alpha_county_by_model %>% top_n(n=1, sum_fraction) %>% select(-sum_fraction)
alpha_model_count<- alpha_ranks %>% na.omit() %>% group_by(model)%>% summarize(model_count=n())
alpha_date_range<-length(unique(alpha_ranks$index))
num_counties<-length(unique(alpha_ranks$county))
alpha_model_sum<-alpha_ranks %>% group_by(model) %>% summarize(fraction_rank=sum(fraction_rank, na.rm=TRUE)) %>% left_join(alpha_model_count, by="model") %>% 
mutate(normalized_score=fraction_rank/(model_count)) %>% filter(normalized_score>0)


delta_ranks <- fractional_ranks %>% filter(index > as.Date(delta_start) & index < as.Date(delta_end))
delta_daily_winner<-delta_ranks %>% group_by(county, index) %>% top_n(n=-1, norm_MAE_rank) %>% select(-norm_MAE_rank)
delta_county_by_model <-delta_ranks %>% group_by(county, model) %>% summarize(sum_fraction=sum(fraction_rank, na.rm=TRUE))
delta_winner<- delta_county_by_model %>% top_n(n=1, sum_fraction) %>% select(-sum_fraction)
delta_model_count<- delta_ranks %>% na.omit() %>% group_by(model)%>% summarize(model_count=n())
delta_date_range<-length(unique(delta_ranks$index))
num_counties<-length(unique(delta_ranks$county))
delta_model_sum<-delta_ranks %>% group_by(model) %>% summarize(fraction_rank=sum(fraction_rank, na.rm=TRUE)) %>% left_join(delta_model_count, by="model") %>% 
mutate(normalized_score=fraction_rank/(model_count)) %>% filter(normalized_score>0)


omicron_ranks <- fractional_ranks %>% filter(index > as.Date(omicron_start) & index < as.Date(omicron_end-delay))
omicron_daily_winner<-omicron_ranks %>% group_by(county, index) %>% top_n(n=-1, norm_MAE_rank) %>% select(-norm_MAE_rank)
omicron_county_by_model <-omicron_ranks %>% group_by(county, model) %>% summarize(sum_fraction=sum(fraction_rank, na.rm=TRUE))
omicron_winner<- omicron_county_by_model %>% top_n(n=1, sum_fraction) %>% select(-sum_fraction)
omicron_model_count<- omicron_ranks %>% na.omit() %>% group_by(model)%>% summarize(model_count=n())
omicron_date_range<-length(unique(omicron_ranks$index))
num_counties<-length(unique(omicron_ranks$county))
omicron_model_sum<-omicron_ranks %>% group_by(model) %>% summarize(fraction_rank=sum(fraction_rank, na.rm=TRUE)) %>% left_join(omicron_model_count, by="model") %>% 
mutate(normalized_score=fraction_rank/(model_count)) %>% filter(normalized_score>0)

Alpha period: Plot maps of best performing models by county

MAE_map_alpha<-alpha_winner %>% left_join(county_fips, by="county") %>%
  left_join(urbnmapr::counties, by = "county_fips") %>% 
  filter(state_name =="California") %>% 
  ggplot(mapping = aes(long, lat, group = group, fill = model)) +
  geom_polygon(color = "#ffffff", size = .25) +
scale_fill_manual(name = "Model", labels=gglabels_MAE[names(MAE_map_col) %in% unique(alpha_winner$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(alpha_winner$model)] )+
  coord_map(projection = "albers", lat0 = 39, lat1 = 45) +
  guides(fill = "none") +
  theme_classic()+
  xlab(NULL)+ ylab(NULL)+
    theme(axis.line = element_blank(),
    plot.background = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    axis.text.x=element_blank(),
        axis.ticks.x=element_blank(),
        axis.text.y=element_blank(),
        axis.ticks.y=element_blank())


#version 1- histogram of winning models by county
MAE_hist_v2<-alpha_winner  %>% ggplot(aes(model)) + geom_histogram(stat="count", aes(fill=model))+ 
scale_fill_manual(name = "Model", labels=gglabels_MAE[names(MAE_map_col) %in% unique(alpha_winner$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(alpha_winner$model)] )+
  scale_x_discrete(name="Model Type", labels= NULL)+
  ylab("Count")+
  # guides(fill = "none") +
  theme_classic()

#version 2- histogram of normalized ranking scores
MAE_hist<-alpha_model_sum  %>% ggplot(aes(model)) + geom_col(aes(x=model, y=normalized_score, fill=model))+ 
scale_fill_manual(name = "Model", labels=gglabels_MAE[names(MAE_map_col) %in% unique(alpha_model_sum$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(alpha_model_sum$model)] )+
  scale_x_discrete(name="Model Type", labels= NULL)+
  ylab("Normalized Rank")+
  # guides(fill = "none") +
  theme_classic()


MAE_heatmap_alpha<-alpha_daily_winner %>% left_join(regions, by= "county") %>% ggplot(aes(y=as.factor(county), x=as.Date(index),  fill=model))+
  geom_tile(color="white", size=0.1) + facet_grid(region_acronym~. , scale= "free")+
  scale_fill_manual(name = "Model:", labels=gglabels_MAE[names(MAE_map_col) %in% unique(alpha_daily_winner$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(alpha_daily_winner$model)] )+
  scale_x_date(date_breaks = "7 days", date_labels = "%b %d", name="Publication Date")+
  ylab("County")+
  # labs(fill = "Model:") +
  theme_classic()+ 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), axis.text.y = element_text(size=8), legend.position = "bottom")#+  ggtitle("Best daily performing model by county as measured by MAE")

participants_heatmap_alpha<-alpha_daily_winner %>% left_join(regions, by= "county") %>% ggplot(aes(y=as.factor(county), x=as.Date(index),  fill=as.factor(models_per_day)))+
  geom_tile(color="white", size=0.1) + facet_grid(region_acronym~. , scale= "free")+
  scale_fill_manual(name = "Number of/Available Forecasts",  values=num_forecasts_col)+
  scale_x_date(date_breaks = "7 days", date_labels = "%b %d", name="Publication Date")+
  ylab("County")+
  guides(fill="none")+  
  theme_classic()+ 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), axis.text.y = element_text(size=8), legend.position = "bottom")#+  ggtitle("Best daily performing model by county as measured by MAE")


MAE_rankings_alpha<-alpha_ranks %>% left_join(regions, by= "county") %>%  group_by(forecaster) %>%
  mutate(med = median(fraction_rank), avg= mean(fraction_rank)) %>% ggplot(aes(x=fraction_rank, fill=as.factor(forecaster))) + geom_density() + 
  geom_vline(aes(xintercept = med, group = forecaster), colour = 'black', lty="dashed")+ 
  geom_vline(aes(xintercept = avg, group = forecaster), colour = 'black', lty="solid")+ 
  facet_grid(forecaster ~.)+
  scale_fill_manual(name = "Model:", labels=names(forecaster_map_col)[names(forecaster_map_col) %in% unique(alpha_ranks$forecaster)], values=forecaster_map_col[names(forecaster_map_col) %in% unique(alpha_ranks$forecaster)] ) + theme_classic()+ xlab("Fractional Ranking")+ ylab("Density")+ guides(fill="none")+ theme(strip.text = element_blank())

col2<- cowplot::plot_grid(MAE_map_alpha, MAE_rankings_alpha, labels=c("B", "C"), ncol=1, align="hv", axis = "l")
cowplot::plot_grid(MAE_heatmap_alpha, col2, ncol=2, labels=c("A", ""), axis = "l", rel_widths = c(2.5,1))

alpha_winner %>% ungroup() %>% count(model) %>% kable
model n
CA-baseline 33
columbia 3
lemma 7
m.proj 8
simple 5
alpha_ranks %>% left_join(regions, by= "county") %>%  group_by(forecaster) %>%
  mutate(med = median(fraction_rank), avg= mean(fraction_rank)) %>% select(forecaster, med, avg) %>% unique() %>% kable()
forecaster med avg
LEMMA 0.6000000 0.6582263
Columbia 0.2000000 0.3995694
Simple Growth 0.6000000 0.6149167
Ensemble 0.6000000 0.5877652
COVID Nearterm 0.6666667 0.7301235
CA-baseline 0.8000000 0.8021667

Whole Delta period: Plot maps of best performing models by county for

MAE_map_delta<-delta_winner %>% left_join(county_fips, by="county") %>%
  left_join(urbnmapr::counties, by = "county_fips") %>% 
  filter(state_name =="California") %>% 
  ggplot(mapping = aes(long, lat, group = group, fill = model)) +
  geom_polygon(color = "#ffffff", size = .25) +
scale_fill_manual(name = "Model", labels=gglabels_MAE[names(MAE_map_col) %in% unique(delta_winner$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(delta_winner$model)] )+
  coord_map(projection = "albers", lat0 = 39, lat1 = 45) +
  guides(fill = "none") +
  theme_classic()+
  xlab(NULL)+ ylab(NULL)+
    theme(axis.line = element_blank(),
    plot.background = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    axis.text.x=element_blank(),
        axis.ticks.x=element_blank(),
        axis.text.y=element_blank(),
        axis.ticks.y=element_blank())


#version 1- histogram of winning models by county
MAE_hist<-delta_winner  %>% ggplot(aes(model)) + geom_histogram(stat="count", aes(fill=model))+ 
scale_fill_manual(name = "Model", labels=gglabels_MAE[names(MAE_map_col) %in% unique(delta_winner$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(delta_winner$model)] )+
  scale_x_discrete(name="Model Type", labels= NULL)+
  ylab("Count")+
  guides(fill = "none") +
  theme_classic()

#version 2- histogram of normalized ranking scores
MAE_hist<-delta_model_sum  %>% ggplot(aes(model)) + geom_col(aes(x=model, y=normalized_score, fill=model))+ 
scale_fill_manual(name = "Model", labels=gglabels_MAE[names(MAE_map_col) %in% unique(delta_model_sum$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(delta_model_sum$model)] )+
  scale_x_discrete(name="Model Type", labels= NULL)+
  ylab("Normalized Rank")+
  guides(fill = "none") +
  theme_classic()


MAE_heatmap_delta<-delta_daily_winner %>% left_join(regions, by= "county") %>% ggplot(aes(y=as.factor(county), x=as.Date(index),  fill=model))+
  geom_tile(color="white", size=0.1) + facet_grid(region_acronym~. , scale= "free")+
  scale_fill_manual(name = "Model:", labels=gglabels_MAE[names(MAE_map_col) %in% unique(delta_daily_winner$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(delta_daily_winner$model)] )+
  scale_x_date(date_breaks = "7 days", date_labels = "%b %d", name="Publication Date")+
  ylab("County")+
  labs(fill = "Model:") +
  theme_classic()+ 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), axis.text.y = element_text(size=8), legend.position = "bottom")#+  ggtitle("Best daily performing model by county as measured by MAE")

participants_heatmap_delta<-delta_daily_winner %>% left_join(regions, by= "county") %>% ggplot(aes(y=as.factor(county), x=as.Date(index),  fill=as.factor(models_per_day)))+
  geom_tile(color="white", size=0.1) + facet_grid(region_acronym~. , scale= "free")+
  scale_fill_manual(name = "Number of/Available Forecasts",  values=num_forecasts_col)+
  scale_x_date(date_breaks = "7 days", date_labels = "%b %d", name="Publication Date")+
  ylab("County")+
  labs(fill = "Model:") +
  theme_classic()+ 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), axis.text.y = element_text(size=8), legend.position = "bottom")#+  ggtitle("Best daily performing model by county as measured by MAE")


MAE_rankings_delta<-delta_ranks %>% left_join(regions, by= "county") %>%  group_by(forecaster) %>%
  mutate(med = median(fraction_rank), avg= mean(fraction_rank)) %>% ggplot(aes(x=fraction_rank, fill=as.factor(forecaster))) + geom_density() + 
  geom_vline(aes(xintercept = med, group = forecaster), colour = 'black', lty="dashed")+ 
  geom_vline(aes(xintercept = avg, group = forecaster), colour = 'black', lty="solid")+ 
  facet_grid(forecaster ~.)+
  scale_fill_manual(name = "Model:", labels=names(forecaster_map_col)[names(forecaster_map_col) %in% unique(alpha_ranks$forecaster)], values=forecaster_map_col[names(forecaster_map_col) %in% unique(alpha_ranks$forecaster)] ) + theme_classic()+ xlab("Fractional Ranking")+ ylab("Density")+ guides(fill="none")+ theme(strip.text = element_blank())


col2<- cowplot::plot_grid(MAE_map_delta, MAE_rankings_delta, labels=c("B", "C"), ncol=1, align="hv", axis = "l")
cowplot::plot_grid(MAE_heatmap_delta, col2, ncol=2, labels=c("A", ""), axis = "l", rel_widths = c(2.5,1))

delta_winner %>% ungroup() %>% count(model) %>% kable
model n
CA-baseline 11
columbia 2
covid_nearterm 16
lemma 7
m.proj 13
simple 6
delta_ranks %>% left_join(regions, by= "county") %>%  group_by(forecaster) %>%
  mutate(med = median(fraction_rank), avg= mean(fraction_rank)) %>% select(forecaster, med, avg) %>% unique() %>% kable()
forecaster med avg
LEMMA 0.6000000 0.6046805
Columbia 0.2500000 0.4140758
COVID Nearterm 0.8333333 0.7461538
Simple Growth 0.6000000 0.5923356
Ensemble 0.6666667 0.6465188
CA-baseline 0.6666667 0.6640650

Omicron: Plot maps of best performing models by count

MAE_map_omicron<-omicron_winner %>% left_join(county_fips, by="county") %>%
  left_join(urbnmapr::counties, by = "county_fips") %>% 
  filter(state_name =="California") %>% 
  ggplot(mapping = aes(long, lat, group = group, fill = model)) +
  geom_polygon(color = "#ffffff", size = .25) +
scale_fill_manual(name = "Model", labels=gglabels_MAE[names(MAE_map_col) %in% unique(omicron_winner$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(omicron_winner$model)] )+
  coord_map(projection = "albers", lat0 = 39, lat1 = 45) +
  guides(fill = "none") +
  theme_classic()+
  xlab(NULL)+ ylab(NULL)+
    theme(axis.line = element_blank(),
    plot.background = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    axis.text.x=element_blank(),
        axis.ticks.x=element_blank(),
        axis.text.y=element_blank(),
        axis.ticks.y=element_blank())


#version 1- histogram of winning models by county
MAE_hist<-omicron_winner  %>% ggplot(aes(model)) + geom_histogram(stat="count", aes(fill=model))+ 
scale_fill_manual(name = "Model", labels=gglabels_MAE[names(MAE_map_col) %in% unique(omicron_winner$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(omicron_winner$model)] )+
  scale_x_discrete(name="Model Type", labels= NULL)+
  ylab("Count")+
  guides(fill = "none") +
  theme_classic()

#version 2- histogram of normalized ranking scores
MAE_hist<-omicron_model_sum  %>% ggplot(aes(model)) + geom_col(aes(x=model, y=normalized_score, fill=model))+ 
scale_fill_manual(name = "Model", labels=gglabels_MAE[names(MAE_map_col) %in% unique(omicron_model_sum$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(omicron_model_sum$model)] )+
  scale_x_discrete(name="Model Type", labels= NULL)+
  ylab("Normalized Rank")+
  guides(fill = "none") +
  theme_classic()

MAE_heatmap_omicron<-omicron_daily_winner %>% left_join(regions, by= "county") %>% ggplot(aes(y=as.factor(county), x=as.Date(index),  fill=model))+
  geom_tile(color="white", size=0.1) + facet_grid(region_acronym~. , scale= "free")+
  scale_fill_manual(name = "Model:", labels=gglabels_MAE[names(MAE_map_col) %in% unique(omicron_daily_winner$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(omicron_daily_winner$model)] )+
  scale_x_date(date_breaks = "7 days", date_labels = "%b %d", name="Publication Date")+
  ylab("County")+
  guides(fill=guide_legend(title.position="left", nrow=1))+
  theme_classic()+ 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), axis.text.y = element_text(size=8), legend.position = "bottom")#+  ggtitle("Best daily performing model by county as measured by MAE")

participants_heatmap_omicron <-omicron_daily_winner %>% left_join(regions, by= "county") %>% ggplot(aes(y=as.factor(county), x=as.Date(index),  fill=as.factor(models_per_day)))+
  geom_tile(color="white", size=0.1) + facet_grid(region_acronym~. , scale= "free")+
  scale_fill_manual(name = "Number of Available Forecasts",  values=num_forecasts_col)+
  scale_x_date(date_breaks = "7 days", date_labels = "%b %d", name="Publication Date")+
  ylab("County")+
  guides(fill=guide_legend(title.position="left", nrow=2))+
  theme_classic()+ 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), axis.text.y = element_text(size=8), legend.position = "bottom")#+  ggtitle("Best daily performing model by county as measured by MAE")


MAE_rankings_omicron<-omicron_ranks %>% left_join(regions, by= "county") %>%  group_by(forecaster) %>% drop_na() %>%
  mutate(med = median(fraction_rank, na.rm=TRUE), avg= mean(fraction_rank, na.rm=TRUE)) %>% ggplot(aes(x=fraction_rank, fill=as.factor(forecaster))) + geom_density() + 
  geom_vline(aes(xintercept = med, group = forecaster), colour = 'black', lty="dashed")+ 
  geom_vline(aes(xintercept = avg, group = forecaster), colour = 'black', lty="solid")+ 
  facet_grid(forecaster ~., scale="free")+
  scale_fill_manual(name = "Model:", labels=names(forecaster_map_col)[names(forecaster_map_col) %in% unique(omicron_ranks$forecaster)], values=forecaster_map_col[names(forecaster_map_col) %in% unique(omicron_ranks$forecaster)] ) + theme_classic()+ xlab("Fractional Ranking")+ ylab("Density")+ 
  guides(fill="none")+ 
  theme(strip.text = element_blank())

# row1<-cowplot::plot_grid(participants_heatmap_omicron, MAE_heatmap_omicron, labels=c("A", "B"), ncol=2, align="hv", axis = "tb")
# row2<-cowplot::plot_grid(MAE_map_omicron, MAE_rankings_omicron, labels=c("C", "D"), ncol=2, align="hv", axis = "l")
# cowplot::plot_grid(row1, row2, ncol=1, align="hv", axis= "tb", rel_heights = c(2.5,1))
# cowplot::plot_grid(MAE_map_omicron, MAE_hist, ncol=2, axis = "l")

col2<- cowplot::plot_grid(MAE_map_omicron, MAE_rankings_omicron, labels=c("B", "C"), ncol=1, align="hv", axis = "l")
cowplot::plot_grid(MAE_heatmap_omicron, col2, ncol=2, labels=c("A", ""), axis = "l", rel_widths = c(2.5,1))

# cowplot::plot_grid(participants_heatmap_omicron, MAE_heatmap_omicron, col2, ncol=3, labels=c("A", "B", ""), axis = "l", rel_widths = c(1,1, .5))

omicron_winner %>% ungroup() %>% count(model) %>% kable
model n
CA-baseline 17
covid_nearterm 14
m.proj 11
simple 13
omicron_ranks %>% left_join(regions, by= "county") %>%  group_by(forecaster) %>%
  mutate(med = median(fraction_rank), avg= mean(fraction_rank)) %>% select(forecaster, med, avg) %>% unique() %>% kable()
forecaster med avg
Columbia NA NA
COVID Nearterm 0.7500000 0.7224925
Simple Growth 0.6666667 0.5869519
Ensemble 0.5000000 0.5987611
LEMMA 0.8000000 0.7377133
CA-baseline 0.7500000 0.6886453

Number of forecasts participants by date

participants_heatmap_alpha<-alpha_daily_winner %>% left_join(regions, by= "county") %>% ggplot(aes(y=as.factor(county), x=as.Date(index),  fill=as.factor(models_per_day)))+
  geom_tile(color="white", size=0.1) + facet_grid(region_acronym~. , scale= "free")+
  scale_fill_manual(name = "Number of Available Forecasts",  values=num_forecasts_col)+
  scale_x_date(date_breaks = "7 days", date_labels = "%b %d", name="Publication Date")+
  ylab("County")+
  #guides(fill="none")+  
  theme_classic()+ 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), axis.text.y = element_text(size=8), legend.position = "bottom")#+  ggtitle("Best daily performing model by county as measured by MAE")

legend <- cowplot::get_legend(
  # create some space to the left of the legend
  participants_heatmap_alpha
)

participants_heatmap_alpha<- participants_heatmap_alpha+ guides(fill="none")

participants_heatmap_delta<-delta_daily_winner %>% left_join(regions, by= "county") %>% ggplot(aes(y=as.factor(county), x=as.Date(index),  fill=as.factor(models_per_day)))+
  geom_tile(color="white", size=0.1) + facet_grid(region_acronym~. , scale= "free")+
  scale_fill_manual(name = "Number of/Available Forecasts",  values=num_forecasts_col)+
  scale_x_date(date_breaks = "7 days", date_labels = "%b %d", name="Publication Date")+
  ylab("")+
  guides(fill="none")+  
  theme_classic()+ 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), axis.text.y = element_text(size=8), legend.position = "bottom")#+  ggtitle("Best daily performing model by county as measured by MAE")


participants_heatmap_omicron <-omicron_daily_winner %>% left_join(regions, by= "county") %>% ggplot(aes(y=as.factor(county), x=as.Date(index),  fill=as.factor(models_per_day)))+
  geom_tile(color="white", size=0.1) + facet_grid(region_acronym~. , scale= "free")+
  scale_fill_manual(name = "Number of Available Forecasts",  values=num_forecasts_col)+
  scale_x_date(date_breaks = "7 days", date_labels = "%b %d", name="Publication Date")+
  ylab("")+
  # guides(fill=guide_legend(title.position="left", nrow=2))+
   guides(fill="none")+  
  theme_classic()+ 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), axis.text.y = element_text(size=8), legend.position = "right")#+  ggtitle("Best daily performing model by county as measured by MAE")


# cowplot::plot_grid(participants_heatmap_alpha, participants_heatmap_delta, participants_heatmap_omicron, labels="AUTO", ncol=3, align="hv", axis = "tb")
row1<- cowplot::plot_grid(participants_heatmap_alpha, participants_heatmap_delta, participants_heatmap_omicron, labels="AUTO", ncol=3, align="hv", axis = "tb")
cowplot::plot_grid(row1, legend, ncol=1, rel_heights=c(3,0.5))

Train Random Forest Classification

## Random Forest 
## 
## 14336 samples
##    13 predictor
##     6 classes: 'simple', 'lemma', 'CA-baseline', 'm.proj', 'covid_nearterm', 'columbia' 
## 
## Pre-processing: centered (13), scaled (13) 
## Resampling: Cross-Validated (4 fold, repeated 4 times) 
## Summary of sample sizes: 10753, 10752, 10752, 10751, 10752, 10752, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##    2    0.5936809  0.4791519
##    7    0.6082419  0.5010488
##   13    0.6032020  0.4946206
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 7.

## rf variable importance
## 
##               Overall
## perc_COVIDvx  100.000
## r_eff          76.989
## diff           72.680
## delta_frac     36.048
## pop_size       33.101
## other_frac     26.019
## alpha_frac     23.536
## gamma_frac     15.309
## omicron_frac    6.161
## income          3.624
## perc_unemploy   1.937
## education       1.583
## perc_pov        0.000

14 days

Score model performance using fractional ranking

#Update MAE definitions here as needed (MAE_7day, MAE_14day, MAE_21day, MAE_28day) and filter by appropriate ahead/delay (e.g., 7, 14, 21, 28)
delay<-14
MAE_ranks_long <- score_cards_tourney %>% filter(ahead <=delay) %>% group_by(county, index) %>% select(index, county, model, forecaster, MAE=MAE_14day, norm_MAE=norm_MAE_14day)  %>% distinct() %>% mutate(MAE_rank=dense_rank(desc(-MAE)), norm_MAE_rank=dense_rank(desc(-norm_MAE)))

max_models<-MAE_ranks_long %>% group_by(county, index)  %>% summarize(models_per_day=max(norm_MAE_rank, na.rm=TRUE))

fractional_ranks<- MAE_ranks_long %>% left_join(max_models, by=c("county", "index")) %>% mutate(fraction_rank= 1- (norm_MAE_rank-1)/models_per_day)
daily_winner<-fractional_ranks %>% group_by(county, index) %>% top_n(n=-1, norm_MAE_rank) %>% select(-norm_MAE_rank)
county_sums<-fractional_ranks %>% group_by(county, model) %>% summarize(county_sum= sum(fraction_rank, na.rm=TRUE))
county_sums_wide<-county_sums %>% pivot_wider(names_from="model", values_from = "county_sum")

alpha_ranks <- fractional_ranks %>% filter(index > as.Date(alpha_start) & index < as.Date(alpha_end))
alpha_daily_winner<-alpha_ranks %>% group_by(county, index) %>% top_n(n=-1, norm_MAE_rank) %>% select(-norm_MAE_rank)
alpha_county_by_model <-alpha_ranks %>% group_by(county, model) %>% summarize(sum_fraction=sum(fraction_rank, na.rm=TRUE))
alpha_winner<- alpha_county_by_model %>% top_n(n=1, sum_fraction) %>% select(-sum_fraction)
alpha_model_count<- alpha_ranks %>% na.omit() %>% group_by(model)%>% summarize(model_count=n())
alpha_date_range<-length(unique(alpha_ranks$index))
num_counties<-length(unique(alpha_ranks$county))
alpha_model_sum<-alpha_ranks %>% group_by(model) %>% summarize(fraction_rank=sum(fraction_rank, na.rm=TRUE)) %>% left_join(alpha_model_count, by="model") %>% 
mutate(normalized_score=fraction_rank/(model_count)) %>% filter(normalized_score>0)


delta_ranks <- fractional_ranks %>% filter(index > as.Date(delta_start) & index < as.Date(delta_end))
delta_daily_winner<-delta_ranks %>% group_by(county, index) %>% top_n(n=-1, norm_MAE_rank) %>% select(-norm_MAE_rank)
delta_county_by_model <-delta_ranks %>% group_by(county, model) %>% summarize(sum_fraction=sum(fraction_rank, na.rm=TRUE))
delta_winner<- delta_county_by_model %>% top_n(n=1, sum_fraction) %>% select(-sum_fraction)
delta_model_count<- delta_ranks %>% na.omit() %>% group_by(model)%>% summarize(model_count=n())
delta_date_range<-length(unique(delta_ranks$index))
num_counties<-length(unique(delta_ranks$county))
delta_model_sum<-delta_ranks %>% group_by(model) %>% summarize(fraction_rank=sum(fraction_rank, na.rm=TRUE)) %>% left_join(delta_model_count, by="model") %>% 
mutate(normalized_score=fraction_rank/(model_count)) %>% filter(normalized_score>0)



omicron_ranks <- fractional_ranks %>% filter(index > as.Date(omicron_start) & index < as.Date(omicron_end-delay))
omicron_daily_winner<-omicron_ranks %>% group_by(county, index) %>% top_n(n=-1, norm_MAE_rank) %>% select(-norm_MAE_rank)
omicron_county_by_model <-omicron_ranks %>% group_by(county, model) %>% summarize(sum_fraction=sum(fraction_rank, na.rm=TRUE))
omicron_winner<- omicron_county_by_model %>% top_n(n=1, sum_fraction) %>% select(-sum_fraction)
omicron_model_count<- omicron_ranks %>% na.omit() %>% group_by(model)%>% summarize(model_count=n())
omicron_date_range<-length(unique(omicron_ranks$index))
num_counties<-length(unique(omicron_ranks$county))
omicron_model_sum<-omicron_ranks %>% group_by(model) %>% summarize(fraction_rank=sum(fraction_rank, na.rm=TRUE)) %>% left_join(omicron_model_count, by="model") %>% 
mutate(normalized_score=fraction_rank/(model_count)) %>% filter(normalized_score>0)

Alpha period: Plot maps of best performing models by county

MAE_map_alpha<-alpha_winner %>% left_join(county_fips, by="county") %>%
  left_join(urbnmapr::counties, by = "county_fips") %>% 
  filter(state_name =="California") %>% 
  ggplot(mapping = aes(long, lat, group = group, fill = model)) +
  geom_polygon(color = "#ffffff", size = .25) +
scale_fill_manual(name = "Model", labels=gglabels_MAE[names(MAE_map_col) %in% unique(alpha_winner$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(alpha_winner$model)] )+
  coord_map(projection = "albers", lat0 = 39, lat1 = 45) +
  guides(fill = "none") +
  theme_classic()+
  xlab(NULL)+ ylab(NULL)+
    theme(axis.line = element_blank(),
    plot.background = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    axis.text.x=element_blank(),
        axis.ticks.x=element_blank(),
        axis.text.y=element_blank(),
        axis.ticks.y=element_blank())


#version 1- histogram of winning models by county
MAE_hist_v2<-alpha_winner  %>% ggplot(aes(model)) + geom_histogram(stat="count", aes(fill=model))+ 
scale_fill_manual(name = "Model", labels=gglabels_MAE[names(MAE_map_col) %in% unique(alpha_winner$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(alpha_winner$model)] )+
  scale_x_discrete(name="Model Type", labels= NULL)+
  ylab("Count")+
  # guides(fill = "none") +
  theme_classic()

#version 2- histogram of normalized ranking scores
MAE_hist<-alpha_model_sum  %>% ggplot(aes(model)) + geom_col(aes(x=model, y=normalized_score, fill=model))+ 
scale_fill_manual(name = "Model", labels=gglabels_MAE[names(MAE_map_col) %in% unique(alpha_model_sum$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(alpha_model_sum$model)] )+
  scale_x_discrete(name="Model Type", labels= NULL)+
  ylab("Normalized Rank")+
  # guides(fill = "none") +
  theme_classic()


MAE_heatmap_alpha<-alpha_daily_winner %>% left_join(regions, by= "county") %>% ggplot(aes(y=as.factor(county), x=as.Date(index),  fill=model))+
  geom_tile(color="white", size=0.1) + facet_grid(region_acronym~. , scale= "free")+
  scale_fill_manual(name = "Model:", labels=gglabels_MAE[names(MAE_map_col) %in% unique(alpha_daily_winner$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(alpha_daily_winner$model)] )+
  scale_x_date(date_breaks = "7 days", date_labels = "%b %d", name="Publication Date")+
  ylab("County")+
  labs(fill = "Model:") +
  theme_classic()+ 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), axis.text.y = element_text(size=8), legend.position = "bottom")#+  ggtitle("Best daily performing model by county as measured by MAE")

participants_heatmap_alpha<-alpha_daily_winner %>% left_join(regions, by= "county") %>% ggplot(aes(y=as.factor(county), x=as.Date(index),  fill=as.factor(models_per_day)))+
  geom_tile(color="white", size=0.1) + facet_grid(region_acronym~. , scale= "free")+
  scale_fill_manual(name = "Number of/Available Forecasts",  values=num_forecasts_col)+
  scale_x_date(date_breaks = "7 days", date_labels = "%b %d", name="Publication Date")+
  ylab("County")+
  guides(fill="none")+  
  theme_classic()+ 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), axis.text.y = element_text(size=8), legend.position = "bottom")#+  ggtitle("Best daily performing model by county as measured by MAE")


MAE_rankings_alpha<-alpha_ranks %>% left_join(regions, by= "county") %>%  group_by(forecaster) %>%
  mutate(med = median(fraction_rank), avg= mean(fraction_rank)) %>% ggplot(aes(x=fraction_rank, fill=as.factor(forecaster))) + geom_density() + 
  geom_vline(aes(xintercept = med, group = forecaster), colour = 'black', lty="dashed")+ 
  geom_vline(aes(xintercept = avg, group = forecaster), colour = 'black', lty="solid")+ 
  facet_grid(forecaster ~.)+
  scale_fill_manual(name = "Model:", labels=names(forecaster_map_col)[names(forecaster_map_col) %in% unique(alpha_ranks$forecaster)], values=forecaster_map_col[names(forecaster_map_col) %in% unique(alpha_ranks$forecaster)] ) + theme_classic()+ xlab("Fractional Ranking")+ ylab("Density")+ guides(fill="none")+ theme(strip.text = element_blank())


#cowplot::plot_grid(MAE_map, MAE_hist, ncol=2, axis = "l")
# cowplot::plot_grid(map14, day14_hist, ncol=2, axis = "l")
# cowplot::plot_grid(map28, day28_hist, ncol=2, axis = "l")

col2<- cowplot::plot_grid(MAE_map_alpha, MAE_rankings_alpha, labels=c("B", "C"), ncol=1, align="hv", axis = "l")
cowplot::plot_grid(MAE_heatmap_alpha, col2, ncol=2, labels=c("A", ""), axis = "l", rel_widths = c(2.5,1))

alpha_winner %>% ungroup() %>% count(model) %>% kable
model n
CA-baseline 30
columbia 4
lemma 9
m.proj 8
simple 4
alpha_ranks %>% left_join(regions, by= "county") %>%  group_by(forecaster) %>%
  mutate(med = median(fraction_rank), avg= mean(fraction_rank)) %>% select(forecaster, med, avg) %>% unique() %>% kable()
forecaster med avg
LEMMA 0.6000000 0.6634190
Columbia 0.2000000 0.4125040
Simple Growth 0.6000000 0.5912348
Ensemble 0.6000000 0.6119545
COVID Nearterm 0.6666667 0.6965432
CA-baseline 0.8000000 0.7869697

Whole Delta period: Plot maps of best performing models by county for

MAE_map_delta<-delta_winner %>% left_join(county_fips, by="county") %>%
  left_join(urbnmapr::counties, by = "county_fips") %>% 
  filter(state_name =="California") %>% 
  ggplot(mapping = aes(long, lat, group = group, fill = model)) +
  geom_polygon(color = "#ffffff", size = .25) +
scale_fill_manual(name = "Model", labels=gglabels_MAE[names(MAE_map_col) %in% unique(delta_winner$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(delta_winner$model)] )+
  coord_map(projection = "albers", lat0 = 39, lat1 = 45) +
  guides(fill = "none") +
  theme_classic()+
  xlab(NULL)+ ylab(NULL)+
    theme(axis.line = element_blank(),
    plot.background = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    axis.text.x=element_blank(),
        axis.ticks.x=element_blank(),
        axis.text.y=element_blank(),
        axis.ticks.y=element_blank())

#version 1- histogram of winning models by county
MAE_hist<-delta_winner  %>% ggplot(aes(model)) + geom_histogram(stat="count", aes(fill=model))+ 
scale_fill_manual(name = "Model", labels=gglabels_MAE[names(MAE_map_col) %in% unique(delta_winner$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(delta_winner$model)] )+
  scale_x_discrete(name="Model Type", labels= NULL)+
  ylab("Count")+
  guides(fill = "none") +
  theme_classic()

#version 2- histogram of normalized ranking scores
MAE_hist<-delta_model_sum  %>% ggplot(aes(model)) + geom_col(aes(x=model, y=normalized_score, fill=model))+ 
scale_fill_manual(name = "Model", labels=gglabels_MAE[names(MAE_map_col) %in% unique(delta_model_sum$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(delta_model_sum$model)] )+
  scale_x_discrete(name="Model Type", labels= NULL)+
  ylab("Normalized Rank")+
  guides(fill = "none") +
  theme_classic()


MAE_heatmap_delta<-delta_daily_winner %>% left_join(regions, by= "county") %>% ggplot(aes(y=as.factor(county), x=as.Date(index),  fill=model))+
  geom_tile(color="white", size=0.1) + facet_grid(region_acronym~. , scale= "free")+
  scale_fill_manual(name = "Model:", labels=gglabels_MAE[names(MAE_map_col) %in% unique(delta_daily_winner$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(delta_daily_winner$model)] )+
  scale_x_date(date_breaks = "7 days", date_labels = "%b %d", name="Publication Date")+
  ylab("County")+
  labs(fill = "Model:") +
  theme_classic()+ 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), axis.text.y = element_text(size=8), legend.position = "bottom")#+  ggtitle("Best daily performing model by county as measured by MAE")

participants_heatmap_delta<-delta_daily_winner %>% left_join(regions, by= "county") %>% ggplot(aes(y=as.factor(county), x=as.Date(index),  fill=as.factor(models_per_day)))+
  geom_tile(color="white", size=0.1) + facet_grid(region_acronym~. , scale= "free")+
  scale_fill_manual(name = "Number of/Available Forecasts",  values=num_forecasts_col)+
  scale_x_date(date_breaks = "7 days", date_labels = "%b %d", name="Publication Date")+
  ylab("County")+
  labs(fill = "Model:") +
  theme_classic()+ 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), axis.text.y = element_text(size=8), legend.position = "bottom")#+  ggtitle("Best daily performing model by county as measured by MAE")


MAE_rankings_delta<-delta_ranks %>% left_join(regions, by= "county") %>%  group_by(forecaster) %>%
  mutate(med = median(fraction_rank), avg= mean(fraction_rank)) %>% ggplot(aes(x=fraction_rank, fill=as.factor(forecaster))) + geom_density() + 
  geom_vline(aes(xintercept = med, group = forecaster), colour = 'black', lty="dashed")+ 
  geom_vline(aes(xintercept = avg, group = forecaster), colour = 'black', lty="solid")+ 
  facet_grid(forecaster ~.)+
  scale_fill_manual(name = "Model:", labels=names(forecaster_map_col)[names(forecaster_map_col) %in% unique(alpha_ranks$forecaster)], values=forecaster_map_col[names(forecaster_map_col) %in% unique(alpha_ranks$forecaster)] ) + theme_classic()+ xlab("Fractional Ranking")+ ylab("Density")+ guides(fill="none")+ theme(strip.text = element_blank())


col2<- cowplot::plot_grid(MAE_map_delta, MAE_rankings_delta, labels=c("B", "C"), ncol=1, align="hv", axis = "l")
cowplot::plot_grid(MAE_heatmap_delta, col2, ncol=2, labels=c("A", ""), axis = "l", rel_widths = c(2.5,1))

delta_winner %>% ungroup() %>% count(model) %>% kable
model n
CA-baseline 12
columbia 2
covid_nearterm 6
lemma 10
m.proj 15
simple 10
delta_ranks %>% left_join(regions, by= "county") %>%  group_by(forecaster) %>%
  mutate(med = median(fraction_rank), avg= mean(fraction_rank)) %>% select(forecaster, med, avg) %>% unique() %>% kable()
forecaster med avg
LEMMA 0.6666667 0.6319085
Columbia 0.3333333 0.4152576
COVID Nearterm 0.6666667 0.6815510
Simple Growth 0.6000000 0.5900665
Ensemble 0.6666667 0.6645528
CA-baseline 0.6666667 0.6488544

Omicron: Plot maps of best performing models by count

MAE_map_omicron<-omicron_winner %>% left_join(county_fips, by="county") %>%
  left_join(urbnmapr::counties, by = "county_fips") %>% 
  filter(state_name =="California") %>% 
  ggplot(mapping = aes(long, lat, group = group, fill = model)) +
  geom_polygon(color = "#ffffff", size = .25) +
scale_fill_manual(name = "Model", labels=gglabels_MAE[names(MAE_map_col) %in% unique(omicron_winner$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(omicron_winner$model)] )+
  coord_map(projection = "albers", lat0 = 39, lat1 = 45) +
  guides(fill = "none") +
  theme_classic()+
  xlab(NULL)+ ylab(NULL)+
    theme(axis.line = element_blank(),
    plot.background = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    axis.text.x=element_blank(),
        axis.ticks.x=element_blank(),
        axis.text.y=element_blank(),
        axis.ticks.y=element_blank())


#version 1- histogram of winning models by county
MAE_hist<-omicron_winner  %>% ggplot(aes(model)) + geom_histogram(stat="count", aes(fill=model))+ 
scale_fill_manual(name = "Model", labels=gglabels_MAE[names(MAE_map_col) %in% unique(omicron_winner$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(omicron_winner$model)] )+
  scale_x_discrete(name="Model Type", labels= NULL)+
  ylab("Count")+
  guides(fill = "none") +
  theme_classic()

#version 2- histogram of normalized ranking scores
MAE_hist<-omicron_model_sum  %>% ggplot(aes(model)) + geom_col(aes(x=model, y=normalized_score, fill=model))+ 
scale_fill_manual(name = "Model", labels=gglabels_MAE[names(MAE_map_col) %in% unique(omicron_model_sum$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(omicron_model_sum$model)] )+
  scale_x_discrete(name="Model Type", labels= NULL)+
  ylab("Normalized Rank")+
  guides(fill = "none") +
  theme_classic()

MAE_heatmap_omicron<-omicron_daily_winner %>% left_join(regions, by= "county") %>% ggplot(aes(y=as.factor(county), x=as.Date(index),  fill=model))+
  geom_tile(color="white", size=0.1) + facet_grid(region_acronym~. , scale= "free")+
  scale_fill_manual(name = "Model:", labels=gglabels_MAE[names(MAE_map_col) %in% unique(omicron_daily_winner$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(omicron_daily_winner$model)] )+
  scale_x_date(date_breaks = "7 days", date_labels = "%b %d", name="Publication Date")+
  ylab("County")+
  guides(fill=guide_legend(title.position="left", nrow=1))+
  theme_classic()+ 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), axis.text.y = element_text(size=8), legend.position = "bottom")#+  ggtitle("Best daily performing model by county as measured by MAE")

participants_heatmap_omicron <-omicron_daily_winner %>% left_join(regions, by= "county") %>% ggplot(aes(y=as.factor(county), x=as.Date(index),  fill=as.factor(models_per_day)))+
  geom_tile(color="white", size=0.1) + facet_grid(region_acronym~. , scale= "free")+
  scale_fill_manual(name = "Number of Available Forecasts",  values=num_forecasts_col)+
  scale_x_date(date_breaks = "7 days", date_labels = "%b %d", name="Publication Date")+
  ylab("County")+
  guides(fill=guide_legend(title.position="left", nrow=2))+
  theme_classic()+ 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), axis.text.y = element_text(size=8), legend.position = "bottom")#+  ggtitle("Best daily performing model by county as measured by MAE")


MAE_rankings_omicron<-omicron_ranks %>% left_join(regions, by= "county") %>%  group_by(forecaster) %>% drop_na() %>%
  mutate(med = median(fraction_rank, na.rm=TRUE), avg= mean(fraction_rank, na.rm=TRUE)) %>% ggplot(aes(x=fraction_rank, fill=as.factor(forecaster))) + geom_density() + 
  geom_vline(aes(xintercept = med, group = forecaster), colour = 'black', lty="dashed")+ 
  geom_vline(aes(xintercept = avg, group = forecaster), colour = 'black', lty="solid")+ 
  facet_grid(forecaster ~., scale="free")+
  scale_fill_manual(name = "Model:", labels=names(forecaster_map_col)[names(forecaster_map_col) %in% unique(omicron_ranks$forecaster)], values=forecaster_map_col[names(forecaster_map_col) %in% unique(omicron_ranks$forecaster)] ) + theme_classic()+ xlab("Fractional Ranking")+ ylab("Density")+ 
  guides(fill="none")+ 
  theme(strip.text = element_blank())

# row1<-cowplot::plot_grid(participants_heatmap_omicron, MAE_heatmap_omicron, labels=c("A", "B"), ncol=2, align="hv", axis = "tb")
# row2<-cowplot::plot_grid(MAE_map_omicron, MAE_rankings_omicron, labels=c("C", "D"), ncol=2, align="hv", axis = "l")
# cowplot::plot_grid(row1, row2, ncol=1, align="hv", axis= "tb", rel_heights = c(2.5,1))
# cowplot::plot_grid(MAE_map_omicron, MAE_hist, ncol=2, axis = "l")

col2<- cowplot::plot_grid(MAE_map_omicron, MAE_rankings_omicron, labels=c("B", "C"), ncol=1, align="hv", axis = "l")
cowplot::plot_grid(MAE_heatmap_omicron, col2, ncol=2, labels=c("A", ""), axis = "l", rel_widths = c(2.5,1))

# cowplot::plot_grid(participants_heatmap_omicron, MAE_heatmap_omicron, col2, ncol=3, labels=c("A", "B", ""), axis = "l", rel_widths = c(1,1, .5))

omicron_winner %>% ungroup() %>% count(model) %>% kable
model n
CA-baseline 17
covid_nearterm 15
m.proj 17
simple 6
omicron_ranks %>% left_join(regions, by= "county") %>%  group_by(forecaster) %>%
  mutate(med = median(fraction_rank), avg= mean(fraction_rank)) %>% select(forecaster, med, avg) %>% unique() %>% kable()
forecaster med avg
Columbia NA NA
COVID Nearterm 0.7500000 0.7259497
Simple Growth 0.6000000 0.5638047
Ensemble 0.6666667 0.6279349
LEMMA 0.8000000 0.7257073
CA-baseline 0.6666667 0.6711672

Number of forecasts participants by date

participants_heatmap_alpha<-alpha_daily_winner %>% left_join(regions, by= "county") %>% ggplot(aes(y=as.factor(county), x=as.Date(index),  fill=as.factor(models_per_day)))+
  geom_tile(color="white", size=0.1) + facet_grid(region_acronym~. , scale= "free")+
  scale_fill_manual(name = "Number of Available Forecasts",  values=num_forecasts_col)+
  scale_x_date(date_breaks = "7 days", date_labels = "%b %d", name="Publication Date")+
  ylab("County")+
  #guides(fill="none")+  
  theme_classic()+ 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), axis.text.y = element_text(size=8), legend.position = "bottom")#+  ggtitle("Best daily performing model by county as measured by MAE")

legend <- cowplot::get_legend(
  # create some space to the left of the legend
  participants_heatmap_alpha
)

participants_heatmap_alpha<- participants_heatmap_alpha+ guides(fill="none")

participants_heatmap_delta<-delta_daily_winner %>% left_join(regions, by= "county") %>% ggplot(aes(y=as.factor(county), x=as.Date(index),  fill=as.factor(models_per_day)))+
  geom_tile(color="white", size=0.1) + facet_grid(region_acronym~. , scale= "free")+
  scale_fill_manual(name = "Number of/Available Forecasts",  values=num_forecasts_col)+
  scale_x_date(date_breaks = "7 days", date_labels = "%b %d", name="Publication Date")+
  ylab("")+
  guides(fill="none")+  
  theme_classic()+ 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), axis.text.y = element_text(size=8), legend.position = "bottom")#+  ggtitle("Best daily performing model by county as measured by MAE")


participants_heatmap_omicron <-omicron_daily_winner %>% left_join(regions, by= "county") %>% ggplot(aes(y=as.factor(county), x=as.Date(index),  fill=as.factor(models_per_day)))+
  geom_tile(color="white", size=0.1) + facet_grid(region_acronym~. , scale= "free")+
  scale_fill_manual(name = "Number of Available Forecasts",  values=num_forecasts_col)+
  scale_x_date(date_breaks = "7 days", date_labels = "%b %d", name="Publication Date")+
  ylab("")+
  # guides(fill=guide_legend(title.position="left", nrow=2))+
   guides(fill="none")+  
  theme_classic()+ 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), axis.text.y = element_text(size=8), legend.position = "right")#+  ggtitle("Best daily performing model by county as measured by MAE")


# cowplot::plot_grid(participants_heatmap_alpha, participants_heatmap_delta, participants_heatmap_omicron, labels="AUTO", ncol=3, align="hv", axis = "tb")
row1<- cowplot::plot_grid(participants_heatmap_alpha, participants_heatmap_delta, participants_heatmap_omicron, labels="AUTO", ncol=3, align="hv", axis = "tb")
cowplot::plot_grid(row1, legend, ncol=1, rel_heights=c(3,0.5))

Train Random Forest Classification

## Random Forest 
## 
## 14041 samples
##    13 predictor
##     6 classes: 'simple', 'lemma', 'CA-baseline', 'm.proj', 'covid_nearterm', 'columbia' 
## 
## Pre-processing: centered (13), scaled (13) 
## Resampling: Cross-Validated (4 fold, repeated 4 times) 
## Summary of sample sizes: 10531, 10531, 10531, 10530, 10532, 10532, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##    2    0.6497753  0.5517726
##    7    0.6630225  0.5713979
##   13    0.6559360  0.5622639
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 7.

## rf variable importance
## 
##                Overall
## perc_COVIDvx  100.0000
## r_eff          73.8465
## diff           65.4926
## pop_size       34.3191
## delta_frac     30.2233
## other_frac     21.5257
## alpha_frac     21.4440
## gamma_frac     11.5425
## income          2.6647
## perc_unemploy   1.5299
## education       1.3116
## omicron_frac    0.5795
## perc_pov        0.0000

21 days

Score model performance using fractional ranking

#Update MAE definitions here as needed (MAE_7day, MAE_14day, MAE_21day, MAE_28day) and filter by appropriate ahead/delay (e.g., 7, 14, 21, 28)
delay<-21
MAE_ranks_long <- score_cards_tourney %>% filter(ahead <=delay) %>% group_by(county, index) %>% select(index, county, model, forecaster, MAE=MAE_21day, norm_MAE=norm_MAE_21day)  %>% distinct() %>% mutate(MAE_rank=dense_rank(desc(-MAE)), norm_MAE_rank=dense_rank(desc(-norm_MAE)))

max_models<-MAE_ranks_long %>% group_by(county, index)  %>% summarize(models_per_day=max(norm_MAE_rank, na.rm=TRUE))

fractional_ranks<- MAE_ranks_long %>% left_join(max_models, by=c("county", "index")) %>% mutate(fraction_rank= 1- (norm_MAE_rank-1)/models_per_day)
daily_winner<-fractional_ranks %>% group_by(county, index) %>% top_n(n=-1, norm_MAE_rank) %>% select(-norm_MAE_rank)
county_sums<-fractional_ranks %>% group_by(county, model) %>% summarize(county_sum= sum(fraction_rank, na.rm=TRUE))
county_sums_wide<-county_sums %>% pivot_wider(names_from="model", values_from = "county_sum")

alpha_ranks <- fractional_ranks %>% filter(index > as.Date(alpha_start) & index < as.Date(alpha_end))
alpha_daily_winner<-alpha_ranks %>% group_by(county, index) %>% top_n(n=-1, norm_MAE_rank) %>% select(-norm_MAE_rank)
alpha_county_by_model <-alpha_ranks %>% group_by(county, model) %>% summarize(sum_fraction=sum(fraction_rank, na.rm=TRUE))
alpha_winner<- alpha_county_by_model %>% top_n(n=1, sum_fraction) %>% select(-sum_fraction)
alpha_model_count<- alpha_ranks %>% na.omit() %>% group_by(model)%>% summarize(model_count=n())
alpha_date_range<-length(unique(alpha_ranks$index))
num_counties<-length(unique(alpha_ranks$county))
alpha_model_sum<-alpha_ranks %>% group_by(model) %>% summarize(fraction_rank=sum(fraction_rank, na.rm=TRUE)) %>% left_join(alpha_model_count, by="model") %>% 
mutate(normalized_score=fraction_rank/(model_count)) %>% filter(normalized_score>0)


delta_ranks <- fractional_ranks %>% filter(index > as.Date(delta_start) & index < as.Date(delta_end))
delta_daily_winner<-delta_ranks %>% group_by(county, index) %>% top_n(n=-1, norm_MAE_rank) %>% select(-norm_MAE_rank)
delta_county_by_model <-delta_ranks %>% group_by(county, model) %>% summarize(sum_fraction=sum(fraction_rank, na.rm=TRUE))
delta_winner<- delta_county_by_model %>% top_n(n=1, sum_fraction) %>% select(-sum_fraction)
delta_model_count<- delta_ranks %>% na.omit() %>% group_by(model)%>% summarize(model_count=n())
delta_date_range<-length(unique(delta_ranks$index))
num_counties<-length(unique(delta_ranks$county))
delta_model_sum<-delta_ranks %>% group_by(model) %>% summarize(fraction_rank=sum(fraction_rank, na.rm=TRUE)) %>% left_join(delta_model_count, by="model") %>% 
mutate(normalized_score=fraction_rank/(model_count)) %>% filter(normalized_score>0)


omicron_ranks <- fractional_ranks %>% filter(index > as.Date(omicron_start) & index < as.Date(omicron_end-delay))
omicron_daily_winner<-omicron_ranks %>% group_by(county, index) %>% top_n(n=-1, norm_MAE_rank) %>% select(-norm_MAE_rank)
omicron_county_by_model <-omicron_ranks %>% group_by(county, model) %>% summarize(sum_fraction=sum(fraction_rank, na.rm=TRUE))
omicron_winner<- omicron_county_by_model %>% top_n(n=1, sum_fraction) %>% select(-sum_fraction)
omicron_model_count<- omicron_ranks %>% na.omit() %>% group_by(model)%>% summarize(model_count=n())
omicron_date_range<-length(unique(omicron_ranks$index))
num_counties<-length(unique(omicron_ranks$county))
omicron_model_sum<-omicron_ranks %>% group_by(model) %>% summarize(fraction_rank=sum(fraction_rank, na.rm=TRUE)) %>% left_join(omicron_model_count, by="model") %>% 
mutate(normalized_score=fraction_rank/(model_count)) %>% filter(normalized_score>0)

Alpha period: Plot maps of best performing models by county

MAE_map_alpha<-alpha_winner %>% left_join(county_fips, by="county") %>%
  left_join(urbnmapr::counties, by = "county_fips") %>% 
  filter(state_name =="California") %>% 
  ggplot(mapping = aes(long, lat, group = group, fill = model)) +
  geom_polygon(color = "#ffffff", size = .25) +
scale_fill_manual(name = "Model", labels=gglabels_MAE[names(MAE_map_col) %in% unique(alpha_winner$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(alpha_winner$model)] )+
  coord_map(projection = "albers", lat0 = 39, lat1 = 45) +
  guides(fill = "none") +
  theme_classic()+
  xlab(NULL)+ ylab(NULL)+
    theme(axis.line = element_blank(),
    plot.background = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    axis.text.x=element_blank(),
        axis.ticks.x=element_blank(),
        axis.text.y=element_blank(),
        axis.ticks.y=element_blank())


#version 1- histogram of winning models by county
MAE_hist_v2<-alpha_winner  %>% ggplot(aes(model)) + geom_histogram(stat="count", aes(fill=model))+ 
scale_fill_manual(name = "Model", labels=gglabels_MAE[names(MAE_map_col) %in% unique(alpha_winner$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(alpha_winner$model)] )+
  scale_x_discrete(name="Model Type", labels= NULL)+
  ylab("Count")+
  # guides(fill = "none") +
  theme_classic()

#version 2- histogram of normalized ranking scores
MAE_hist<-alpha_model_sum  %>% ggplot(aes(model)) + geom_col(aes(x=model, y=normalized_score, fill=model))+ 
scale_fill_manual(name = "Model", labels=gglabels_MAE[names(MAE_map_col) %in% unique(alpha_model_sum$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(alpha_model_sum$model)] )+
  scale_x_discrete(name="Model Type", labels= NULL)+
  ylab("Normalized Rank")+
  # guides(fill = "none") +
  theme_classic()


MAE_heatmap_alpha<-alpha_daily_winner %>% left_join(regions, by= "county") %>% ggplot(aes(y=as.factor(county), x=as.Date(index),  fill=model))+
  geom_tile(color="white", size=0.1) + facet_grid(region_acronym~. , scale= "free")+
  scale_fill_manual(name = "Model:", labels=gglabels_MAE[names(MAE_map_col) %in% unique(alpha_daily_winner$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(alpha_daily_winner$model)] )+
  scale_x_date(date_breaks = "7 days", date_labels = "%b %d", name="Publication Date")+
  ylab("County")+
  labs(fill = "Model:") +
  theme_classic()+ 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), axis.text.y = element_text(size=8), legend.position = "bottom")#+  ggtitle("Best daily performing model by county as measured by MAE")

participants_heatmap_alpha<-alpha_daily_winner %>% left_join(regions, by= "county") %>% ggplot(aes(y=as.factor(county), x=as.Date(index),  fill=as.factor(models_per_day)))+
  geom_tile(color="white", size=0.1) + facet_grid(region_acronym~. , scale= "free")+
  scale_fill_manual(name = "Number of/Available Forecasts",  values=num_forecasts_col)+
  scale_x_date(date_breaks = "7 days", date_labels = "%b %d", name="Publication Date")+
  ylab("County")+
  guides(fill="none")+  
  theme_classic()+ 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), axis.text.y = element_text(size=8), legend.position = "bottom")#+  ggtitle("Best daily performing model by county as measured by MAE")


MAE_rankings_alpha<-alpha_ranks %>% left_join(regions, by= "county") %>%  group_by(forecaster) %>%
  mutate(med = median(fraction_rank), avg= mean(fraction_rank)) %>% ggplot(aes(x=fraction_rank, fill=as.factor(forecaster))) + geom_density() + 
  geom_vline(aes(xintercept = med, group = forecaster), colour = 'black', lty="dashed")+ 
  geom_vline(aes(xintercept = avg, group = forecaster), colour = 'black', lty="solid")+ 
  facet_grid(forecaster ~.)+
  scale_fill_manual(name = "Model:", labels=names(forecaster_map_col)[names(forecaster_map_col) %in% unique(alpha_ranks$forecaster)], values=forecaster_map_col[names(forecaster_map_col) %in% unique(alpha_ranks$forecaster)] ) + theme_classic()+ xlab("Fractional Ranking")+ ylab("Density")+ guides(fill="none")+ theme(strip.text = element_blank())

col2<- cowplot::plot_grid(MAE_map_alpha, MAE_rankings_alpha, labels=c("B", "C"), ncol=1, align="hv", axis = "l")
cowplot::plot_grid(MAE_heatmap_alpha, col2, ncol=2, labels=c("A", ""), axis = "l", rel_widths = c(2.5,1))

alpha_winner %>% ungroup() %>% count(model) %>% kable
model n
CA-baseline 26
columbia 4
lemma 13
m.proj 9
simple 4
alpha_ranks %>% left_join(regions, by= "county") %>%  group_by(forecaster) %>%
  mutate(med = median(fraction_rank), avg= mean(fraction_rank)) %>% select(forecaster, med, avg) %>% unique() %>% kable()
forecaster med avg
LEMMA 0.6000000 0.6595111
Columbia 0.2000000 0.4191308
Simple Growth 0.5000000 0.5830530
Ensemble 0.6000000 0.6278788
COVID Nearterm 0.6666667 0.7081481
CA-baseline 0.8000000 0.7731061

Whole Delta period: Plot maps of best performing models by county for

MAE_map_delta<-delta_winner %>% left_join(county_fips, by="county") %>%
  left_join(urbnmapr::counties, by = "county_fips") %>% 
  filter(state_name =="California") %>% 
  ggplot(mapping = aes(long, lat, group = group, fill = model)) +
  geom_polygon(color = "#ffffff", size = .25) +
scale_fill_manual(name = "Model", labels=gglabels_MAE[names(MAE_map_col) %in% unique(delta_winner$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(delta_winner$model)] )+
  coord_map(projection = "albers", lat0 = 39, lat1 = 45) +
  guides(fill = "none") +
  theme_classic()+
  xlab(NULL)+ ylab(NULL)+
    theme(axis.line = element_blank(),
    plot.background = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    axis.text.x=element_blank(),
        axis.ticks.x=element_blank(),
        axis.text.y=element_blank(),
        axis.ticks.y=element_blank())


#version 1- histogram of winning models by county
MAE_hist<-delta_winner  %>% ggplot(aes(model)) + geom_histogram(stat="count", aes(fill=model))+ 
scale_fill_manual(name = "Model", labels=gglabels_MAE[names(MAE_map_col) %in% unique(delta_winner$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(delta_winner$model)] )+
  scale_x_discrete(name="Model Type", labels= NULL)+
  ylab("Count")+
  guides(fill = "none") +
  theme_classic()

#version 2- histogram of normalized ranking scores
MAE_hist<-delta_model_sum  %>% ggplot(aes(model)) + geom_col(aes(x=model, y=normalized_score, fill=model))+ 
scale_fill_manual(name = "Model", labels=gglabels_MAE[names(MAE_map_col) %in% unique(delta_model_sum$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(delta_model_sum$model)] )+
  scale_x_discrete(name="Model Type", labels= NULL)+
  ylab("Normalized Rank")+
  guides(fill = "none") +
  theme_classic()


MAE_heatmap_delta<-delta_daily_winner %>% left_join(regions, by= "county") %>% ggplot(aes(y=as.factor(county), x=as.Date(index),  fill=model))+
  geom_tile(color="white", size=0.1) + facet_grid(region_acronym~. , scale= "free")+
  scale_fill_manual(name = "Model:", labels=gglabels_MAE[names(MAE_map_col) %in% unique(delta_daily_winner$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(delta_daily_winner$model)] )+
  scale_x_date(date_breaks = "7 days", date_labels = "%b %d", name="Publication Date")+
  ylab("County")+
  labs(fill = "Model:") +
  theme_classic()+ 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), axis.text.y = element_text(size=8), legend.position = "bottom")#+  ggtitle("Best daily performing model by county as measured by MAE")

participants_heatmap_delta<-delta_daily_winner %>% left_join(regions, by= "county") %>% ggplot(aes(y=as.factor(county), x=as.Date(index),  fill=as.factor(models_per_day)))+
  geom_tile(color="white", size=0.1) + facet_grid(region_acronym~. , scale= "free")+
  scale_fill_manual(name = "Number of/Available Forecasts",  values=num_forecasts_col)+
  scale_x_date(date_breaks = "7 days", date_labels = "%b %d", name="Publication Date")+
  ylab("County")+
  labs(fill = "Model:") +
  theme_classic()+ 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), axis.text.y = element_text(size=8), legend.position = "bottom")#+  ggtitle("Best daily performing model by county as measured by MAE")


MAE_rankings_delta<-delta_ranks %>% left_join(regions, by= "county") %>%  group_by(forecaster) %>%
  mutate(med = median(fraction_rank), avg= mean(fraction_rank)) %>% ggplot(aes(x=fraction_rank, fill=as.factor(forecaster))) + geom_density() + 
  geom_vline(aes(xintercept = med, group = forecaster), colour = 'black', lty="dashed")+ 
  geom_vline(aes(xintercept = avg, group = forecaster), colour = 'black', lty="solid")+ 
  facet_grid(forecaster ~.)+
  scale_fill_manual(name = "Model:", labels=names(forecaster_map_col)[names(forecaster_map_col) %in% unique(alpha_ranks$forecaster)], values=forecaster_map_col[names(forecaster_map_col) %in% unique(alpha_ranks$forecaster)] ) + theme_classic()+ xlab("Fractional Ranking")+ ylab("Density")+ guides(fill="none")+ theme(strip.text = element_blank())


col2<- cowplot::plot_grid(MAE_map_delta, MAE_rankings_delta, labels=c("B", "C"), ncol=1, align="hv", axis = "l")
cowplot::plot_grid(MAE_heatmap_delta, col2, ncol=2, labels=c("A", ""), axis = "l", rel_widths = c(2.5,1))

delta_winner %>% ungroup() %>% count(model) %>% kable
model n
CA-baseline 14
columbia 2
covid_nearterm 4
lemma 12
m.proj 11
simple 12
delta_ranks %>% left_join(regions, by= "county") %>%  group_by(forecaster) %>%
  mutate(med = median(fraction_rank), avg= mean(fraction_rank)) %>% select(forecaster, med, avg) %>% unique() %>% kable()
forecaster med avg
LEMMA 0.6666667 0.6612781
Columbia 0.3333333 0.4109773
COVID Nearterm 0.6666667 0.6267667
Simple Growth 0.6000000 0.5956837
Ensemble 0.6666667 0.6618108
CA-baseline 0.6666667 0.6488174

Omicron: Plot maps of best performing models by count

MAE_map_omicron<-omicron_winner %>% left_join(county_fips, by="county") %>%
  left_join(urbnmapr::counties, by = "county_fips") %>% 
  filter(state_name =="California") %>% 
  ggplot(mapping = aes(long, lat, group = group, fill = model)) +
  geom_polygon(color = "#ffffff", size = .25) +
scale_fill_manual(name = "Model", labels=gglabels_MAE[names(MAE_map_col) %in% unique(omicron_winner$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(omicron_winner$model)] )+
  coord_map(projection = "albers", lat0 = 39, lat1 = 45) +
  guides(fill = "none") +
  theme_classic()+
  xlab(NULL)+ ylab(NULL)+
    theme(axis.line = element_blank(),
    plot.background = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    axis.text.x=element_blank(),
        axis.ticks.x=element_blank(),
        axis.text.y=element_blank(),
        axis.ticks.y=element_blank())


#version 1- histogram of winning models by county
MAE_hist<-omicron_winner  %>% ggplot(aes(model)) + geom_histogram(stat="count", aes(fill=model))+ 
scale_fill_manual(name = "Model", labels=gglabels_MAE[names(MAE_map_col) %in% unique(omicron_winner$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(omicron_winner$model)] )+
  scale_x_discrete(name="Model Type", labels= NULL)+
  ylab("Count")+
  guides(fill = "none") +
  theme_classic()

#version 2- histogram of normalized ranking scores
MAE_hist<-omicron_model_sum  %>% ggplot(aes(model)) + geom_col(aes(x=model, y=normalized_score, fill=model))+ 
scale_fill_manual(name = "Model", labels=gglabels_MAE[names(MAE_map_col) %in% unique(omicron_model_sum$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(omicron_model_sum$model)] )+
  scale_x_discrete(name="Model Type", labels= NULL)+
  ylab("Normalized Rank")+
  guides(fill = "none") +
  theme_classic()

MAE_heatmap_omicron<-omicron_daily_winner %>% left_join(regions, by= "county") %>% ggplot(aes(y=as.factor(county), x=as.Date(index),  fill=model))+
  geom_tile(color="white", size=0.1) + facet_grid(region_acronym~. , scale= "free")+
  scale_fill_manual(name = "Model:", labels=gglabels_MAE[names(MAE_map_col) %in% unique(omicron_daily_winner$model)], values=MAE_map_col[names(MAE_map_col) %in% unique(omicron_daily_winner$model)] )+
  scale_x_date(date_breaks = "7 days", date_labels = "%b %d", name="Publication Date")+
  ylab("County")+
  guides(fill=guide_legend(title.position="left", nrow=1))+
  theme_classic()+ 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), axis.text.y = element_text(size=8), legend.position = "bottom")#+  ggtitle("Best daily performing model by county as measured by MAE")

participants_heatmap_omicron <-omicron_daily_winner %>% left_join(regions, by= "county") %>% ggplot(aes(y=as.factor(county), x=as.Date(index),  fill=as.factor(models_per_day)))+
  geom_tile(color="white", size=0.1) + facet_grid(region_acronym~. , scale= "free")+
  scale_fill_manual(name = "Number of Available Forecasts",  values=num_forecasts_col)+
  scale_x_date(date_breaks = "7 days", date_labels = "%b %d", name="Publication Date")+
  ylab("County")+
  guides(fill=guide_legend(title.position="left", nrow=2))+
  theme_classic()+ 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), axis.text.y = element_text(size=8), legend.position = "bottom")#+  ggtitle("Best daily performing model by county as measured by MAE")


MAE_rankings_omicron<-omicron_ranks %>% left_join(regions, by= "county") %>%  group_by(forecaster) %>% drop_na() %>%
  mutate(med = median(fraction_rank, na.rm=TRUE), avg= mean(fraction_rank, na.rm=TRUE)) %>% ggplot(aes(x=fraction_rank, fill=as.factor(forecaster))) + geom_density() + 
  geom_vline(aes(xintercept = med, group = forecaster), colour = 'black', lty="dashed")+ 
  geom_vline(aes(xintercept = avg, group = forecaster), colour = 'black', lty="solid")+ 
  facet_grid(forecaster ~., scale="free")+
  scale_fill_manual(name = "Model:", labels=names(forecaster_map_col)[names(forecaster_map_col) %in% unique(omicron_ranks$forecaster)], values=forecaster_map_col[names(forecaster_map_col) %in% unique(omicron_ranks$forecaster)] ) + theme_classic()+ xlab("Fractional Ranking")+ ylab("Density")+ 
  guides(fill="none")+ 
  theme(strip.text = element_blank())

# row1<-cowplot::plot_grid(participants_heatmap_omicron, MAE_heatmap_omicron, labels=c("A", "B"), ncol=2, align="hv", axis = "tb")
# row2<-cowplot::plot_grid(MAE_map_omicron, MAE_rankings_omicron, labels=c("C", "D"), ncol=2, align="hv", axis = "l")
# cowplot::plot_grid(row1, row2, ncol=1, align="hv", axis= "tb", rel_heights = c(2.5,1))
# cowplot::plot_grid(MAE_map_omicron, MAE_hist, ncol=2, axis = "l")

col2<- cowplot::plot_grid(MAE_map_omicron, MAE_rankings_omicron, labels=c("B", "C"), ncol=1, align="hv", axis = "l")
cowplot::plot_grid(MAE_heatmap_omicron, col2, ncol=2, labels=c("A", ""), axis = "l", rel_widths = c(2.5,1))

# cowplot::plot_grid(participants_heatmap_omicron, MAE_heatmap_omicron, col2, ncol=3, labels=c("A", "B", ""), axis = "l", rel_widths = c(1,1, .5))

omicron_winner %>% ungroup() %>% count(model) %>% kable
model n
CA-baseline 18
covid_nearterm 14
m.proj 18
simple 5
omicron_ranks %>% left_join(regions, by= "county") %>%  group_by(forecaster) %>%
  mutate(med = median(fraction_rank), avg= mean(fraction_rank)) %>% select(forecaster, med, avg) %>% unique() %>% kable()
forecaster med avg
Columbia NA NA
COVID Nearterm 0.7500000 0.7523077
Simple Growth 0.6666667 0.5726212
Ensemble 0.6666667 0.6626515
LEMMA 0.4000000 0.3891304
CA-baseline 0.6666667 0.6514545

Number of forecasts participants by date

participants_heatmap_alpha<-alpha_daily_winner %>% left_join(regions, by= "county") %>% ggplot(aes(y=as.factor(county), x=as.Date(index),  fill=as.factor(models_per_day)))+
  geom_tile(color="white", size=0.1) + facet_grid(region_acronym~. , scale= "free")+
  scale_fill_manual(name = "Number of Available Forecasts",  values=num_forecasts_col)+
  scale_x_date(date_breaks = "7 days", date_labels = "%b %d", name="Publication Date")+
  ylab("County")+
  #guides(fill="none")+  
  theme_classic()+ 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), axis.text.y = element_text(size=8), legend.position = "bottom")#+  ggtitle("Best daily performing model by county as measured by MAE")

legend <- cowplot::get_legend(
  # create some space to the left of the legend
  participants_heatmap_alpha
)

participants_heatmap_alpha<- participants_heatmap_alpha+ guides(fill="none")

participants_heatmap_delta<-delta_daily_winner %>% left_join(regions, by= "county") %>% ggplot(aes(y=as.factor(county), x=as.Date(index),  fill=as.factor(models_per_day)))+
  geom_tile(color="white", size=0.1) + facet_grid(region_acronym~. , scale= "free")+
  scale_fill_manual(name = "Number of/Available Forecasts",  values=num_forecasts_col)+
  scale_x_date(date_breaks = "7 days", date_labels = "%b %d", name="Publication Date")+
  ylab("")+
  guides(fill="none")+  
  theme_classic()+ 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), axis.text.y = element_text(size=8), legend.position = "bottom")#+  ggtitle("Best daily performing model by county as measured by MAE")


participants_heatmap_omicron <-omicron_daily_winner %>% left_join(regions, by= "county") %>% ggplot(aes(y=as.factor(county), x=as.Date(index),  fill=as.factor(models_per_day)))+
  geom_tile(color="white", size=0.1) + facet_grid(region_acronym~. , scale= "free")+
  scale_fill_manual(name = "Number of Available Forecasts",  values=num_forecasts_col)+
  scale_x_date(date_breaks = "7 days", date_labels = "%b %d", name="Publication Date")+
  ylab("")+
  # guides(fill=guide_legend(title.position="left", nrow=2))+
   guides(fill="none")+  
  theme_classic()+ 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), axis.text.y = element_text(size=8), legend.position = "right")#+  ggtitle("Best daily performing model by county as measured by MAE")


# cowplot::plot_grid(participants_heatmap_alpha, participants_heatmap_delta, participants_heatmap_omicron, labels="AUTO", ncol=3, align="hv", axis = "tb")
row1<- cowplot::plot_grid(participants_heatmap_alpha, participants_heatmap_delta, participants_heatmap_omicron, labels="AUTO", ncol=3, align="hv", axis = "tb")
cowplot::plot_grid(row1, legend, ncol=1, rel_heights=c(3,0.5))

Train Random Forest Classification

## Random Forest 
## 
## 13741 samples
##    13 predictor
##     6 classes: 'simple', 'lemma', 'CA-baseline', 'm.proj', 'covid_nearterm', 'columbia' 
## 
## Pre-processing: centered (13), scaled (13) 
## Resampling: Cross-Validated (4 fold, repeated 4 times) 
## Summary of sample sizes: 10306, 10305, 10307, 10305, 10306, 10306, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##    2    0.6660359  0.5725530
##    7    0.6780623  0.5901373
##   13    0.6687472  0.5780590
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 7.

## rf variable importance
## 
##                Overall
## perc_COVIDvx  100.0000
## r_eff          71.1164
## diff           63.5219
## pop_size       38.8639
## delta_frac     30.2719
## other_frac     21.3506
## alpha_frac     19.7568
## gamma_frac     10.0062
## income          2.5827
## perc_unemploy   2.1010
## education       1.2531
## omicron_frac    0.6508
## perc_pov        0.0000

Relative error

score_cards_tourney_rel <- score_cards_tourney %>% mutate(relative_error= predicted-actual, norm_relative_error= relative_error*100/hosp_capacity)

rel_error_7day<- score_cards_tourney_rel %>% filter(ahead==7) %>% mutate(variant_period= 
                    case_when(index > as.Date(alpha_start) & index < as.Date(alpha_end) ~"Alpha",
                              index > as.Date(delta_start) & index < as.Date(delta_end) ~"Delta",
                              index > as.Date(omicron_start) & index < as.Date(omicron_end) ~"Omicron")) #%>% drop_na(variant_period)

rel_error_14day<- score_cards_tourney_rel %>% filter(ahead==14) %>% mutate(variant_period= 
                    case_when(index > as.Date(alpha_start) & index < as.Date(alpha_end) ~"Alpha",
                              index > as.Date(delta_start) & index < as.Date(delta_end) ~"Delta",
                              index > as.Date(omicron_start) & index < as.Date(omicron_end) ~"Omicron")) %>% drop_na(variant_period)

rel_error_21day<- score_cards_tourney_rel %>% filter(ahead==21) %>% mutate(variant_period= 
                    case_when(index > as.Date(alpha_start) & index < as.Date(alpha_end) ~"Alpha",
                              index > as.Date(delta_start) & index < as.Date(delta_end) ~"Delta",
                              index > as.Date(omicron_start) & index < as.Date(omicron_end) ~"Omicron")) %>% drop_na(variant_period)
rel_error<- rel_error_7day %>% bind_rows(rel_error_14day) %>% bind_rows(rel_error_21day)

alpha_rel_error_7day <- rel_error_7day %>% filter(index > as.Date(alpha_start) & index < as.Date(alpha_end)) %>% ggplot(aes(x=norm_relative_error, y=forecaster, fill=forecaster))+ geom_density_ridges()+
   scale_fill_manual(name = "Model:", labels=names(forecaster_map_col)[names(forecaster_map_col) %in% unique(rel_error_7day$forecaster)], values=forecaster_map_col[names(forecaster_map_col) %in% unique(rel_error_7day$forecaster)]) + 
  theme_classic()+ xlab("Normalized Relative Error (%)")+ ylab("Model")+ guides(fill="none")+ theme(strip.text = element_blank())+ ggtitle("Alpha: 7-Day Relative Error")

delta_rel_error_7day <- rel_error_7day %>% filter(index > as.Date(delta_start) & index < as.Date(delta_end)) %>% ggplot(aes(x=norm_relative_error, y=forecaster, fill=forecaster))+ geom_density_ridges()+
   scale_fill_manual(name = "Model:", labels=names(forecaster_map_col)[names(forecaster_map_col) %in% unique(alpha_rel_error_7day$forecaster)], values=forecaster_map_col[names(forecaster_map_col) %in% unique(alpha_rel_error_7day$forecaster)]) + 
  theme_classic()+ xlab("Normalized Relative Error (%)")+ ylab("Model")+ guides(fill="none")+ theme(strip.text = element_blank())+ ggtitle("Delta: 7-Day Relative Error")

omicron_rel_error_14day <- rel_error_14day %>% filter(index > as.Date(omicron_start) & index < as.Date(omicron_end)) %>% ggplot(aes(x=norm_relative_error, y=forecaster, fill=forecaster))+ geom_density_ridges()+
   scale_fill_manual(name = "Model:", labels=names(forecaster_map_col)[names(forecaster_map_col) %in% unique(rel_error_14day$forecaster)], values=forecaster_map_col[names(forecaster_map_col) %in% unique(rel_error_14day$forecaster)]) + 
  theme_classic()+ xlab("Normalized Relative Error (%)")+ ylab("Model")+ guides(fill="none")+ theme(strip.text = element_blank())+ ggtitle("Omicron: 14-Day Relative Error")

rel_error_7day %>% ggplot(aes(x=norm_relative_error, y=forecaster, fill=forecaster))+ geom_density_ridges()+
   scale_fill_manual(name = "Model:", labels=names(forecaster_map_col)[names(forecaster_map_col) %in% unique(rel_error_7day$forecaster)], values=forecaster_map_col[names(forecaster_map_col) %in% unique(rel_error_7day$forecaster)]) + 
  theme_classic()+ xlab("Normalized Relative Error (%)")+ ylab("Model")+ guides(fill="none")+ geom_vline(xintercept= 0, lty="dashed") + ggtitle("7-Day Relative Error")

rel_error_7day %>% ggplot(aes(x=norm_relative_error, y=forecaster, fill=forecaster))+ geom_density_ridges()+ facet_wrap(county ~.)+
   scale_fill_manual(name = "Model:", labels=names(forecaster_map_col)[names(forecaster_map_col) %in% unique(rel_error_7day$forecaster)], values=forecaster_map_col[names(forecaster_map_col) %in% unique(rel_error_7day$forecaster)]) + 
  theme_classic()+ xlab("Normalized Relative Error (%)")+ ylab("Model")+ guides(fill="none")+ geom_vline(xintercept= 0, lty="dashed") + ggtitle("7-Day Relative Error")

rel_error_14day %>% ggplot(aes(x=norm_relative_error, y=forecaster, fill=forecaster))+ geom_density_ridges()+ facet_wrap(variant_period~.)+
   scale_fill_manual(name = "Model:", labels=names(forecaster_map_col)[names(forecaster_map_col) %in% unique(rel_error_7day$forecaster)], values=forecaster_map_col[names(forecaster_map_col) %in% unique(rel_error_7day$forecaster)]) + 
  theme_classic()+ xlab("Normalized Relative Error (%)")+ ylab("Model")+ guides(fill="none")+ geom_vline(xintercept= 0, lty="dashed") + ggtitle("14-Day Relative Error")

rel_error_21day %>% ggplot(aes(x=norm_relative_error, y=forecaster, fill=forecaster))+ geom_density_ridges()+
   scale_fill_manual(name = "Model:", labels=names(forecaster_map_col)[names(forecaster_map_col) %in% unique(alpha_rel_error_7day$forecaster)], values=forecaster_map_col[names(forecaster_map_col) %in% unique(alpha_rel_error_7day$forecaster)]) + 
  theme_classic()+ xlab("Normalized Relative Error (%)")+ ylab("Model")+ guides(fill="none")+ geom_vline(xintercept= 0, lty="dashed") + ggtitle("21-Day Relative Error")

rel_error %>% drop_na(variant_period) %>% ggplot(aes(x=norm_relative_error, y=forecaster, fill=forecaster))+ facet_grid(variant_period~ ahead, scale="free")+ geom_density_ridges(rel_min_height = 0.01) + geom_vline(xintercept=0, lty="dashed")+
   scale_fill_manual(name = "Model:", labels=names(forecaster_map_col)[names(forecaster_map_col) %in% unique(rel_error_7day$forecaster)], values=forecaster_map_col[names(forecaster_map_col) %in% unique(rel_error_7day$forecaster)]) + 
  theme_classic()+ xlab("Normalized Relative Error (%)")+ ylab("Model")+ guides(fill="none")+ xlim(-75,200)

A7day<-rel_error_7day %>% mutate(forecast_date=as.Date(forecast_date)) %>% group_by(forecast_date, forecaster) %>% summarise(sd=sd(norm_relative_error, na.rm=TRUE), norm_relative_error=mean(norm_relative_error, na.rm=TRUE), ymin= norm_relative_error-sd, ymax= norm_relative_error+sd) %>% ggplot() + geom_path(aes(x=forecast_date, y=norm_relative_error, col= forecaster))+ geom_ribbon(aes(x= forecast_date, ymin=ymin, ymax=ymax, fill= forecaster), alpha=0.2)+ theme_classic()+ facet_wrap(forecaster~.)+
  scale_fill_manual(name = "Model:", labels=names(forecaster_map_col)[names(forecaster_map_col) %in% unique(rel_error_7day$forecaster)], values=forecaster_map_col[names(forecaster_map_col) %in% unique(rel_error_7day$forecaster)])+
    scale_color_manual(name = "Model:", labels=names(forecaster_map_col)[names(forecaster_map_col) %in% unique(rel_error_7day$forecaster)], values=forecaster_map_col[names(forecaster_map_col) %in% unique(rel_error_7day$forecaster)])+ xlab("Publication Date")+ ylab("Normalized Relative Error (%)") + geom_hline(yintercept= 0, lty="dashed")+ guides(fill="none",col="none")

B14day<-rel_error_14day %>% mutate(forecast_date=as.Date(forecast_date)) %>% group_by(forecast_date, forecaster) %>% summarise(sd=sd(norm_relative_error, na.rm=TRUE), norm_relative_error=mean(norm_relative_error, na.rm=TRUE), ymin= norm_relative_error-sd, ymax= norm_relative_error+sd) %>% ggplot() + geom_path(aes(x=forecast_date, y=norm_relative_error, col= forecaster))+ geom_ribbon(aes(x= forecast_date, ymin=ymin, ymax=ymax, fill= forecaster), alpha=0.2)+ theme_classic()+ facet_wrap(forecaster~.)+
  scale_fill_manual(name = "Model:", labels=names(forecaster_map_col)[names(forecaster_map_col) %in% unique(rel_error_7day$forecaster)], values=forecaster_map_col[names(forecaster_map_col) %in% unique(rel_error_7day$forecaster)])+
    scale_color_manual(name = "Model:", labels=names(forecaster_map_col)[names(forecaster_map_col) %in% unique(rel_error_7day$forecaster)], values=forecaster_map_col[names(forecaster_map_col) %in% unique(rel_error_7day$forecaster)])+ xlab("Publication Date")+ ylab("Normalized Relative Error (%)") + geom_hline(yintercept= 0, lty="dashed")


C21day<-rel_error_21day %>% mutate(forecast_date=as.Date(forecast_date)) %>% group_by(forecast_date, forecaster) %>% summarise(sd=sd(norm_relative_error, na.rm=TRUE), norm_relative_error=mean(norm_relative_error, na.rm=TRUE), ymin= norm_relative_error-sd, ymax= norm_relative_error+sd) %>% ggplot() + geom_path(aes(x=forecast_date, y=norm_relative_error, col= forecaster))+ geom_ribbon(aes(x= forecast_date, ymin=ymin, ymax=ymax, fill= forecaster), alpha=0.2)+ theme_classic()+ facet_wrap(forecaster~.)+
  scale_fill_manual(name = "Model:", labels=names(forecaster_map_col)[names(forecaster_map_col) %in% unique(rel_error_7day$forecaster)], values=forecaster_map_col[names(forecaster_map_col) %in% unique(rel_error_7day$forecaster)])+
    scale_color_manual(name = "Model:", labels=names(forecaster_map_col)[names(forecaster_map_col) %in% unique(rel_error_7day$forecaster)], values=forecaster_map_col[names(forecaster_map_col) %in% unique(rel_error_7day$forecaster)])+ xlab("Publication Date")+ ylab("Normalized Relative Error (%)")+ geom_hline(yintercept= 0, lty="dashed") + guides(fill="none",col="none")

cowplot::plot_grid(A7day, B14day, C21day, ncol=1, labels= "AUTO", align = "v", axis = "lr")

Plot RF results

SI Figure 23

cowplot::plot_grid(var_imp_7day, var_imp_14day, var_imp_21day, labels="AUTO", ncol= 1)

Compare ensemble performance across counties

SI Figure 24

library(forcats)
library(equatiomatic)

ensemble_scores<- score_cards_tourney %>% filter(forecaster=="Ensemble", ahead==1) %>% left_join(regions, by="county")

#Individual counties colored by region x-axis ordered by county population size
#7 day
A_7day<- ensemble_scores %>% ggplot(aes(x=fct_reorder(county, dof_pop_county), y=norm_MAE_7day)) + 
  geom_boxplot(aes(fill=region_acronym)) + 
  scale_y_continuous(trans='log10') + 
  # facet_grid(fct_reorder(region_acronym, dof_pop_county) ~.,scales="free")+
  scale_fill_manual(name = "Health Officer Regions", values= region_col_abr)+
  theme_classic()+ ylab("Normalized 7-day MAE (log scale)")+ xlab("County (in order of increasing population size)")+ theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), legend.position = "right") + geom_smooth(method="lm", se=TRUE, aes(group=1), col="black", linetype="dashed") #+ #guides(fill="none") #+ 

legend <- cowplot::get_legend(
  # create some space to the left of the legend
  A_7day
)

A_7day<- A_7day+ guides(fill="none")

#14 day
B_14day <- ensemble_scores %>% ggplot(aes(x=fct_reorder(county, dof_pop_county), y=norm_MAE_14day)) + 
  geom_boxplot(aes(fill=region_acronym)) + 
  scale_y_continuous(trans='log10') + 
  # facet_grid(fct_reorder(region_acronym, dof_pop_county) ~.,scales="free")+
  scale_fill_manual(name = "Health Officer Regions", values= region_col_abr)+
  theme_classic()+ ylab("Normalized 14-day MAE (log scale)")+ xlab("County (in order of increasing population size)")+ theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + geom_smooth(method="lm", se=TRUE, aes(group=1), col="black", linetype="dashed") + guides(fill="none")

#21 day
C_21day<- ensemble_scores %>% ggplot(aes(x=fct_reorder(county, dof_pop_county), y=norm_MAE_21day)) + 
  geom_boxplot(aes(fill=region_acronym)) + 
  scale_y_continuous(trans='log10') + 
  # facet_grid(fct_reorder(region_acronym, dof_pop_county) ~.,scales="free")+
  scale_fill_manual(name = "Health Officer Regions", values= region_col_abr)+
  theme_classic()+ ylab("Normalized 21-day MAE (log scale)")+ xlab("County (in order of increasing population size)")+ theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + geom_smooth(method="lm", se=TRUE, aes(group=1), col="black", linetype="dashed") + guides(fill="none") 


cowplot::plot_grid(A_7day, B_14day, C_21day, legend, labels=c("A", "B", "C", ""), ncol=2)

#Individual counties colored by region x-axis ordered by median norm_MAE
ensemble_scores %>% group_by(county) %>% mutate(median_norm_MAE=median(norm_MAE_14day)) %>% ggplot(aes(x=fct_reorder(county, median_norm_MAE), y=norm_MAE_14day)) + 
  geom_boxplot(aes(fill=region_acronym)) + 
  scale_y_continuous(trans='log10') + 
  # facet_grid(fct_reorder(region_acronym, dof_pop_county) ~.,scales="free")+
  scale_fill_manual(name = "Health Officer Regions", values= region_col_abr)+
  theme_classic()+ ylab("Normalized Mean Absolute Error (log scale)")+ xlab("County (in order of median normalized MAE)")+ theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + geom_smooth(method="lm", se=TRUE, aes(group=1), col="black", linetype="dashed") #+ #guides(fill="none") #+ 

ensemble_scores %>% ggplot(aes(x=fct_reorder(county, dof_pop_county), y=norm_MAE_14day, fill=region_acronym)) + 
  geom_boxplot() + 
  #scale_y_continuous(trans='log10') + 
  # facet_grid(fct_reorder(region_acronym, dof_pop_county) ~.,scales="free")+
  scale_fill_manual(name = "Health Officer Regions", values= region_col_abr)+
  theme_classic()+ ylab("Normalized Mean Absolute Error")+ xlab("County (in order of increasing population size)")+ theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + geom_smooth(method="lm", se=TRUE, aes(group=1), col="black", linetype="dashed") #+ #guides(fill="none") #+ 

#use log of norm_MAE because of increasing variance for smaller counties
county_lm<-lm(log(norm_MAE_14day) ~fct_reorder(county, dof_pop_county), data=ensemble_scores)
summary(county_lm)
## 
## Call:
## lm(formula = log(norm_MAE_14day) ~ fct_reorder(county, dof_pop_county), 
##     data = ensemble_scores)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.6581 -0.6717 -0.0346  0.6562  3.4809 
## 
## Coefficients:
##                                                      Estimate Std. Error
## (Intercept)                                         1.458e+00  5.113e-02
## fct_reorder(county, dof_pop_county)Trinity         -9.580e-01  7.230e-02
## fct_reorder(county, dof_pop_county)Mono            -1.895e-02  7.230e-02
## fct_reorder(county, dof_pop_county)Mariposa         7.935e-01  7.230e-02
## fct_reorder(county, dof_pop_county)Inyo            -1.894e-01  7.230e-02
## fct_reorder(county, dof_pop_county)Plumas          -4.762e-01  7.230e-02
## fct_reorder(county, dof_pop_county)Colusa           5.116e-02  7.230e-02
## fct_reorder(county, dof_pop_county)Del Norte       -3.674e-01  7.230e-02
## fct_reorder(county, dof_pop_county)Glenn           -2.328e-01  7.230e-02
## fct_reorder(county, dof_pop_county)Lassen           5.908e-01  7.230e-02
## fct_reorder(county, dof_pop_county)Amador          -3.344e-01  7.230e-02
## fct_reorder(county, dof_pop_county)Siskiyou        -2.957e-01  7.230e-02
## fct_reorder(county, dof_pop_county)Calaveras        6.350e-01  7.230e-02
## fct_reorder(county, dof_pop_county)Tuolumne        -2.806e-01  7.230e-02
## fct_reorder(county, dof_pop_county)San Benito       6.099e-01  7.230e-02
## fct_reorder(county, dof_pop_county)Lake             5.188e-05  7.230e-02
## fct_reorder(county, dof_pop_county)Tehama           4.292e-02  7.230e-02
## fct_reorder(county, dof_pop_county)Yuba            -4.802e-01  7.230e-02
## fct_reorder(county, dof_pop_county)Mendocino       -2.609e-01  7.230e-02
## fct_reorder(county, dof_pop_county)Nevada          -4.548e-01  7.230e-02
## fct_reorder(county, dof_pop_county)Humboldt        -9.072e-01  7.230e-02
## fct_reorder(county, dof_pop_county)Napa            -4.355e-01  7.230e-02
## fct_reorder(county, dof_pop_county)Kings           -5.922e-02  7.230e-02
## fct_reorder(county, dof_pop_county)Madera          -9.468e-01  7.230e-02
## fct_reorder(county, dof_pop_county)Shasta          -1.022e+00  7.230e-02
## fct_reorder(county, dof_pop_county)Imperial        -6.776e-01  7.230e-02
## fct_reorder(county, dof_pop_county)El Dorado        9.984e-02  7.230e-02
## fct_reorder(county, dof_pop_county)Butte           -9.799e-01  7.230e-02
## fct_reorder(county, dof_pop_county)Yolo             1.394e-02  7.230e-02
## fct_reorder(county, dof_pop_county)Marin           -8.099e-01  7.230e-02
## fct_reorder(county, dof_pop_county)Santa Cruz      -8.012e-01  7.230e-02
## fct_reorder(county, dof_pop_county)San Luis Obispo -7.050e-01  7.230e-02
## fct_reorder(county, dof_pop_county)Merced          -2.327e-01  7.230e-02
## fct_reorder(county, dof_pop_county)Placer          -7.814e-01  7.230e-02
## fct_reorder(county, dof_pop_county)Solano          -1.015e+00  7.230e-02
## fct_reorder(county, dof_pop_county)Monterey        -1.566e+00  7.230e-02
## fct_reorder(county, dof_pop_county)Santa Barbara   -1.052e+00  7.230e-02
## fct_reorder(county, dof_pop_county)Tulare          -7.310e-01  7.230e-02
## fct_reorder(county, dof_pop_county)Sonoma          -9.660e-01  7.230e-02
## fct_reorder(county, dof_pop_county)Stanislaus      -7.499e-01  7.230e-02
## fct_reorder(county, dof_pop_county)San Mateo       -9.991e-01  7.230e-02
## fct_reorder(county, dof_pop_county)San Joaquin     -9.727e-01  7.230e-02
## fct_reorder(county, dof_pop_county)Ventura         -1.188e+00  7.230e-02
## fct_reorder(county, dof_pop_county)San Francisco   -2.103e+00  7.230e-02
## fct_reorder(county, dof_pop_county)Kern            -7.714e-01  7.230e-02
## fct_reorder(county, dof_pop_county)Fresno          -6.523e-01  7.230e-02
## fct_reorder(county, dof_pop_county)Contra Costa    -1.125e+00  7.230e-02
## fct_reorder(county, dof_pop_county)Sacramento      -1.314e+00  7.230e-02
## fct_reorder(county, dof_pop_county)Alameda         -1.642e+00  7.230e-02
## fct_reorder(county, dof_pop_county)Santa Clara     -1.666e+00  7.230e-02
## fct_reorder(county, dof_pop_county)San Bernardino  -8.167e-01  7.230e-02
## fct_reorder(county, dof_pop_county)Riverside       -1.239e+00  7.230e-02
## fct_reorder(county, dof_pop_county)Orange          -1.421e+00  7.230e-02
## fct_reorder(county, dof_pop_county)San Diego       -1.601e+00  7.230e-02
## fct_reorder(county, dof_pop_county)Los Angeles     -1.301e+00  7.230e-02
##                                                    t value Pr(>|t|)    
## (Intercept)                                         28.521  < 2e-16 ***
## fct_reorder(county, dof_pop_county)Trinity         -13.249  < 2e-16 ***
## fct_reorder(county, dof_pop_county)Mono             -0.262 0.793277    
## fct_reorder(county, dof_pop_county)Mariposa         10.975  < 2e-16 ***
## fct_reorder(county, dof_pop_county)Inyo             -2.619 0.008816 ** 
## fct_reorder(county, dof_pop_county)Plumas           -6.586 4.64e-11 ***
## fct_reorder(county, dof_pop_county)Colusa            0.708 0.479226    
## fct_reorder(county, dof_pop_county)Del Norte        -5.082 3.78e-07 ***
## fct_reorder(county, dof_pop_county)Glenn            -3.220 0.001286 ** 
## fct_reorder(county, dof_pop_county)Lassen            8.171 3.24e-16 ***
## fct_reorder(county, dof_pop_county)Amador           -4.625 3.77e-06 ***
## fct_reorder(county, dof_pop_county)Siskiyou         -4.090 4.33e-05 ***
## fct_reorder(county, dof_pop_county)Calaveras         8.783  < 2e-16 ***
## fct_reorder(county, dof_pop_county)Tuolumne         -3.881 0.000104 ***
## fct_reorder(county, dof_pop_county)San Benito        8.435  < 2e-16 ***
## fct_reorder(county, dof_pop_county)Lake              0.001 0.999427    
## fct_reorder(county, dof_pop_county)Tehama            0.594 0.552825    
## fct_reorder(county, dof_pop_county)Yuba             -6.642 3.19e-11 ***
## fct_reorder(county, dof_pop_county)Mendocino        -3.608 0.000309 ***
## fct_reorder(county, dof_pop_county)Nevada           -6.291 3.23e-10 ***
## fct_reorder(county, dof_pop_county)Humboldt        -12.547  < 2e-16 ***
## fct_reorder(county, dof_pop_county)Napa             -6.023 1.74e-09 ***
## fct_reorder(county, dof_pop_county)Kings            -0.819 0.412741    
## fct_reorder(county, dof_pop_county)Madera          -13.095  < 2e-16 ***
## fct_reorder(county, dof_pop_county)Shasta          -14.133  < 2e-16 ***
## fct_reorder(county, dof_pop_county)Imperial         -9.372  < 2e-16 ***
## fct_reorder(county, dof_pop_county)El Dorado         1.381 0.167339    
## fct_reorder(county, dof_pop_county)Butte           -13.553  < 2e-16 ***
## fct_reorder(county, dof_pop_county)Yolo              0.193 0.847101    
## fct_reorder(county, dof_pop_county)Marin           -11.202  < 2e-16 ***
## fct_reorder(county, dof_pop_county)Santa Cruz      -11.080  < 2e-16 ***
## fct_reorder(county, dof_pop_county)San Luis Obispo  -9.750  < 2e-16 ***
## fct_reorder(county, dof_pop_county)Merced           -3.218 0.001291 ** 
## fct_reorder(county, dof_pop_county)Placer          -10.808  < 2e-16 ***
## fct_reorder(county, dof_pop_county)Solano          -14.042  < 2e-16 ***
## fct_reorder(county, dof_pop_county)Monterey        -21.662  < 2e-16 ***
## fct_reorder(county, dof_pop_county)Santa Barbara   -14.553  < 2e-16 ***
## fct_reorder(county, dof_pop_county)Tulare          -10.110  < 2e-16 ***
## fct_reorder(county, dof_pop_county)Sonoma          -13.360  < 2e-16 ***
## fct_reorder(county, dof_pop_county)Stanislaus      -10.371  < 2e-16 ***
## fct_reorder(county, dof_pop_county)San Mateo       -13.818  < 2e-16 ***
## fct_reorder(county, dof_pop_county)San Joaquin     -13.453  < 2e-16 ***
## fct_reorder(county, dof_pop_county)Ventura         -16.437  < 2e-16 ***
## fct_reorder(county, dof_pop_county)San Francisco   -29.092  < 2e-16 ***
## fct_reorder(county, dof_pop_county)Kern            -10.669  < 2e-16 ***
## fct_reorder(county, dof_pop_county)Fresno           -9.022  < 2e-16 ***
## fct_reorder(county, dof_pop_county)Contra Costa    -15.565  < 2e-16 ***
## fct_reorder(county, dof_pop_county)Sacramento      -18.171  < 2e-16 ***
## fct_reorder(county, dof_pop_county)Alameda         -22.709  < 2e-16 ***
## fct_reorder(county, dof_pop_county)Santa Clara     -23.048  < 2e-16 ***
## fct_reorder(county, dof_pop_county)San Bernardino  -11.295  < 2e-16 ***
## fct_reorder(county, dof_pop_county)Riverside       -17.131  < 2e-16 ***
## fct_reorder(county, dof_pop_county)Orange          -19.657  < 2e-16 ***
## fct_reorder(county, dof_pop_county)San Diego       -22.148  < 2e-16 ***
## fct_reorder(county, dof_pop_county)Los Angeles     -17.991  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9551 on 19140 degrees of freedom
##   (715 observations deleted due to missingness)
## Multiple R-squared:  0.2971, Adjusted R-squared:  0.2951 
## F-statistic: 149.8 on 54 and 19140 DF,  p-value: < 2.2e-16
equatiomatic::extract_eq(county_lm)

\[ \operatorname{log(norm\_MAE\_14day)} = \alpha + \beta_{1}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Trinity}}) + \beta_{2}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Mono}}) + \beta_{3}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Mariposa}}) + \beta_{4}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Inyo}}) + \beta_{5}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Plumas}}) + \beta_{6}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Colusa}}) + \beta_{7}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Del\ Norte}}) + \beta_{8}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Glenn}}) + \beta_{9}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Lassen}}) + \beta_{10}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Amador}}) + \beta_{11}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Siskiyou}}) + \beta_{12}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Calaveras}}) + \beta_{13}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Tuolumne}}) + \beta_{14}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{San\ Benito}}) + \beta_{15}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Lake}}) + \beta_{16}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Tehama}}) + \beta_{17}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Yuba}}) + \beta_{18}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Mendocino}}) + \beta_{19}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Nevada}}) + \beta_{20}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Humboldt}}) + \beta_{21}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Napa}}) + \beta_{22}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Kings}}) + \beta_{23}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Madera}}) + \beta_{24}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Shasta}}) + \beta_{25}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Imperial}}) + \beta_{26}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{El\ Dorado}}) + \beta_{27}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Butte}}) + \beta_{28}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Yolo}}) + \beta_{29}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Marin}}) + \beta_{30}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Santa\ Cruz}}) + \beta_{31}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{San\ Luis\ Obispo}}) + \beta_{32}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Merced}}) + \beta_{33}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Placer}}) + \beta_{34}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Solano}}) + \beta_{35}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Monterey}}) + \beta_{36}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Santa\ Barbara}}) + \beta_{37}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Tulare}}) + \beta_{38}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Sonoma}}) + \beta_{39}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Stanislaus}}) + \beta_{40}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{San\ Mateo}}) + \beta_{41}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{San\ Joaquin}}) + \beta_{42}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Ventura}}) + \beta_{43}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{San\ Francisco}}) + \beta_{44}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Kern}}) + \beta_{45}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Fresno}}) + \beta_{46}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Contra\ Costa}}) + \beta_{47}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Sacramento}}) + \beta_{48}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Alameda}}) + \beta_{49}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Santa\ Clara}}) + \beta_{50}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{San\ Bernardino}}) + \beta_{51}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Riverside}}) + \beta_{52}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Orange}}) + \beta_{53}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{San\ Diego}}) + \beta_{54}(\operatorname{fct\_reorder(county,\ dof\_pop\_county)}_{\operatorname{Los\ Angeles}}) + \epsilon \]

pop_lm<-lm(log(norm_MAE_14day) ~dof_pop_county, data=ensemble_scores)
summary(pop_lm)
## 
## Call:
## lm(formula = log(norm_MAE_14day) ~ dof_pop_county, data = ensemble_scores)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.4127 -0.7558 -0.0203  0.7335  4.5449 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     9.584e-01  8.838e-03  108.44   <2e-16 ***
## dof_pop_county -1.818e-07  5.258e-09  -34.58   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.104 on 19193 degrees of freedom
##   (715 observations deleted due to missingness)
## Multiple R-squared:  0.05865,    Adjusted R-squared:  0.0586 
## F-statistic:  1196 on 1 and 19193 DF,  p-value: < 2.2e-16
equatiomatic::extract_eq(pop_lm)

\[ \operatorname{log(norm\_MAE\_14day)} = \alpha + \beta_{1}(\operatorname{dof\_pop\_county}) + \epsilon \]

#Grouped by regions
ensemble_scores %>% ggplot(aes(x=dof_pop_county, y=norm_MAE_14day)) + 
  geom_boxplot(aes(fill=region_acronym)) + 
  geom_smooth(method='lm')+
  scale_y_continuous(trans='log10') + 
  scale_x_continuous(trans='log10') + 
    scale_fill_manual(name = "Health Officer Regions", values= region_col_abr)+
  # facet_grid(fct_reorder(region_acronym, dof_pop_county) ~.,scales="free")+
  #scale_fill_manual(name = "Health Officer Regions", values= met.brewer("Hokusai1", 5))+
  theme_classic()+ xlab("Population Size")+ ylab("Normalized MAE")# + guides(fill="none") #+ theme(strip.text = element_blank())

#Individual counties with facet grid split by regions
ensemble_scores %>% ggplot(aes(x=norm_MAE_14day, y=fct_reorder(county, dof_pop_county), fill=region_acronym)) + 
  geom_boxplot() + 
  scale_x_continuous(trans='log10') + 
  geom_smooth(method='lm', orientation="y", aes(group=1), col="black", linetype="dashed", se=TRUE)+
  facet_grid(fct_reorder(region_acronym, dof_pop_county) ~.,scales="free")+
  scale_fill_manual(name = "Health Officer Regions", values= region_col_abr)+
  theme_classic()+ xlab("Normalized Mean Absolute Error")+ ylab("County")+ guides(fill="none") #+ theme(strip.text = element_blank())

max_models %>% group_by(county) %>% left_join(regions, by="county") %>% ggplot(aes(x=fct_reorder(county, dof_pop_county), y= models_per_day)) + geom_boxplot() + theme_classic()